In [247]:
#Import Dependencies 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


Task Phases for EDA using Python: 
1. Data Preprocessing: 

• Load the Data: 


In [248]:
# Load the dataset into a pandas DataFrame
df = pd.read_csv('Global_Temp.csv')

Initial Data Inspection


In [249]:
# Inspect first rows of the DataFrame
print(df.head())

                                                                                          Land-Ocean: Global Means
Year Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec  J-D  D-N  DJF  MAM  JJA                       SON
1880 -.20 -.26 -.09 -.17 -.10 -.22 -.21 -.11 -.16 -.23 -.23 -.19 -.18 ***  ***  -.12 -.18                     -.21
1881 -.20 -.16 .02  .03  .06  -.19 .00  -.05 -.16 -.22 -.19 -.08 -.10 -.11 -.18 .04  -.08                     -.19
1882 .15  .13  .04  -.17 -.14 -.23 -.17 -.08 -.15 -.24 -.17 -.37 -.12 -.09 .07  -.09 -.16                     -.19
1883 -.30 -.37 -.13 -.19 -.18 -.08 -.08 -.15 -.23 -.12 -.24 -.12 -.18 -.20 -.35 -.17 -.10                     -.20


Check the shape & data types


In [250]:
print(df.shape)
print(df.dtypes)

(146, 1)
Land-Ocean: Global Means    object
dtype: object


• Clean the Data:



In [251]:
# Reload the dataset with proper parsing to handle spacing issues
df = pd.read_csv('Global_Temp.csv', skiprows=1)  # Skipping the first row as it might be incorrectly formatted


In [252]:
# Rename columns for easier access
df.columns = ["Year", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
              "J-D", "D-N", "DJF", "MAM", "JJA", "SON"]



In [253]:
# Ensure the 'Year' column is in integer format
df["Year"] = pd.to_numeric(df["Year"], errors="coerce").astype("Int64")

# Convert all temperature anomaly columns to numeric, handling non-numeric values
for col in df.columns[1:]:  # Excluding 'Year'
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Check for missing values
missing_values = df.isnull().sum()



In [254]:
# Fill missing values with the column mean
df.fillna(df.mean(), inplace=True)

# Verify the data types and missing values after cleaning
df_info = df.info()

# Display missing values count
missing_values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    145 non-null    Int64  
 1   Jan     145 non-null    float64
 2   Feb     145 non-null    float64
 3   Mar     145 non-null    float64
 4   Apr     145 non-null    float64
 5   May     145 non-null    float64
 6   Jun     145 non-null    float64
 7   Jul     145 non-null    float64
 8   Aug     145 non-null    float64
 9   Sep     145 non-null    float64
 10  Oct     145 non-null    float64
 11  Nov     145 non-null    float64
 12  Dec     145 non-null    float64
 13  J-D     145 non-null    float64
 14  D-N     145 non-null    float64
 15  DJF     145 non-null    float64
 16  MAM     145 non-null    float64
 17  JJA     145 non-null    float64
 18  SON     145 non-null    float64
dtypes: Int64(1), float64(18)
memory usage: 21.8 KB


Year    0
Jan     0
Feb     0
Mar     0
Apr     0
May     0
Jun     0
Jul     0
Aug     0
Sep     0
Oct     0
Nov     0
Dec     0
J-D     0
D-N     1
DJF     1
MAM     0
JJA     0
SON     0
dtype: int64

In [255]:
# Check if the data now has the correct structure
df.info(), df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    145 non-null    Int64  
 1   Jan     145 non-null    float64
 2   Feb     145 non-null    float64
 3   Mar     145 non-null    float64
 4   Apr     145 non-null    float64
 5   May     145 non-null    float64
 6   Jun     145 non-null    float64
 7   Jul     145 non-null    float64
 8   Aug     145 non-null    float64
 9   Sep     145 non-null    float64
 10  Oct     145 non-null    float64
 11  Nov     145 non-null    float64
 12  Dec     145 non-null    float64
 13  J-D     145 non-null    float64
 14  D-N     145 non-null    float64
 15  DJF     145 non-null    float64
 16  MAM     145 non-null    float64
 17  JJA     145 non-null    float64
 18  SON     145 non-null    float64
dtypes: Int64(1), float64(18)
memory usage: 21.8 KB


(None,
    Year   Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov  \
 0  1880 -0.20 -0.26 -0.09 -0.17 -0.10 -0.22 -0.21 -0.11 -0.16 -0.23 -0.23   
 1  1881 -0.20 -0.16  0.02  0.03  0.06 -0.19  0.00 -0.05 -0.16 -0.22 -0.19   
 2  1882  0.15  0.13  0.04 -0.17 -0.14 -0.23 -0.17 -0.08 -0.15 -0.24 -0.17   
 3  1883 -0.30 -0.37 -0.13 -0.19 -0.18 -0.08 -0.08 -0.15 -0.23 -0.12 -0.24   
 4  1884 -0.13 -0.09 -0.37 -0.41 -0.34 -0.35 -0.31 -0.28 -0.28 -0.25 -0.34   
 
     Dec   J-D       D-N       DJF   MAM   JJA   SON  
 0 -0.19 -0.18  0.074444  0.068889 -0.12 -0.18 -0.21  
 1 -0.08 -0.10 -0.110000 -0.180000  0.04 -0.08 -0.19  
 2 -0.37 -0.12 -0.090000  0.070000 -0.09 -0.16 -0.19  
 3 -0.12 -0.18 -0.200000 -0.350000 -0.17 -0.10 -0.20  
 4 -0.31 -0.29 -0.270000 -0.110000 -0.37 -0.32 -0.29  )

2. Exploratory Data Analysis (EDA): 


In [256]:
# Display basic statistics
print("\nDescriptive Statistics:")
print(df.describe())


Descriptive Statistics:
            Year         Jan         Feb         Mar         Apr         May  \
count      145.0  145.000000  145.000000  145.000000  145.000000  145.000000   
mean      1952.0    0.068690    0.077172    0.094828    0.068690    0.057862   
std    42.001984    0.434603    0.441759    0.446745    0.408954    0.389138   
min       1880.0   -0.810000   -0.630000   -0.630000   -0.600000   -0.550000   
25%       1916.0   -0.250000   -0.240000   -0.230000   -0.250000   -0.240000   
50%       1952.0   -0.010000   -0.040000    0.010000   -0.030000   -0.040000   
75%       1988.0    0.320000    0.390000    0.320000    0.290000    0.280000   
max       2024.0    1.240000    1.440000    1.390000    1.310000    1.160000   

              Jun         Jul         Aug         Sep         Oct         Nov  \
count  145.000000  145.000000  145.000000  145.000000  145.000000  145.000000   
mean     0.045724    0.070276    0.068552    0.073793    0.099655    0.091655   
std      0.

In [257]:

# Identify unusually high or low temperature anomalies for all columns using IQR method

outliers_dict = {}

for column in df.columns[1:]:  # Excluding "Year" column
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1

    # Define lower and upper bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    



In [258]:
    # Filter outliers for each column
outliers_dict[column] = df[(df[column] < lower_bound) | (df[column] > upper_bound)][["Year", column]]

# Combine all detected outliers into a single DataFrame
outliers_combined = pd.concat(outliers_dict.values(), axis=0).drop_duplicates()
print(outliers_combined)


     Year   SON
135  2015  0.99
143  2023  1.41
144  2024  1.29


Data Cleaning