In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import shap
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
color_pal = sns.color_palette()
#import utils
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder

pd.set_option('display.max_rows', 1000)

In [None]:
# Read the weather observations and the country metadata, removing exact duplicate rows
df = pd.read_csv('/kaggle/input/global-daily-climate-data/weather.csv').drop_duplicates()
df_countries = pd.read_csv("/kaggle/input/global-daily-climate-data/countries.csv").drop_duplicates()

# Left-join each country's region onto the observations; the result is indexed by country
df = (
    df.set_index("country")
      .join(df_countries.set_index("country")[['region']], how='left')
)

# Keep only rows whose avg_temp_c is above -273; rows where avg_temp_c is NaN
# fail the comparison and are removed as well
df = df.loc[df['avg_temp_c'] > -273]
print(df.shape)
df.sample(5).T

# Data Exploration

In [None]:
df.head()

In [None]:
df.shape

In [None]:
cols = df.columns
cols

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
df.dtypes

In [None]:
unique_values =  df.nunique()
unique_values

In [None]:
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")

# null values

In [None]:
df.isna().sum()

In [None]:
df.isna().sum().plot(kind ='bar')

In [None]:
# Drop the listed measurement columns from the DataFrame.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the columns are already gone.
df = df.drop(
    columns=['snow_depth_mm', 'avg_wind_dir_deg', 'avg_wind_speed_kmh',
             'peak_wind_gust_kmh', 'avg_sea_level_pres_hpa', 'sunshine_total_min'],
    errors='ignore',
)

In [None]:
# Impute missing minimum temperatures with the column mean
min_temp_fill = df['min_temp_c'].mean()
df['min_temp_c'] = df['min_temp_c'].fillna(min_temp_fill)

# Impute missing maximum temperatures with the column median
max_temp_fill = df['max_temp_c'].median()
df['max_temp_c'] = df['max_temp_c'].fillna(max_temp_fill)

# Discard rows that lack either a region or a precipitation value
df = df.dropna(subset=['region', 'precipitation_mm'])

# Report how many missing values remain per column, then the resulting shape
missing_value_counts = df.isna().sum()
print("Count of missing values in each column:")
print(missing_value_counts)
print(df.shape)

# Data visualisation

In [None]:
# Number of rows recorded for each season
season_distribution = df['season'].value_counts()

# Bar chart of the per-season row counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
season_distribution.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Distribution of Seasons')
ax.set_xlabel('Season')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Pie chart of the share of rows belonging to each season
fig, ax = plt.subplots(figsize=(6, 6))
df['season'].value_counts().plot(
    kind='pie',
    autopct='%1.1f%%',
    colors=['lightblue', 'lightcoral', 'r', 'b'],
    ax=ax,
)
ax.set_title('season percentage')
plt.show()

In [None]:
df

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still contains string columns at this point (e.g. 'season'), and
# DataFrame.corr() no longer drops them silently on pandas >= 2.0, so restrict
# the computation to numeric columns explicitly.
plt.figure(figsize=(10,10))
Corr_Matrix = df.corr(numeric_only=True)
# Reuse the matrix computed above instead of calculating the correlations twice
sns.heatmap(Corr_Matrix, annot=True, fmt=".2f")
plt.show()

In [None]:
print('Top 5 Most Positively Correlated to the Target Variable')
Corr_Matrix['avg_temp_c'].sort_values(ascending=False).head(5)

In [None]:
print('Top 5 Most Negatively Correlated to the Target Variable')
Corr_Matrix['avg_temp_c'].sort_values(ascending=True).head(5)

In [None]:
top_10_Value_avg_temp_c = df.nlargest(10, 'avg_temp_c')
top_10_Value_avg_temp_c

In [None]:
# Box plot of avg_temp_c grouped by season
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='season', y='avg_temp_c', ax=ax)
ax.set_title('Average Temperature Distribution by Season')
plt.show()

In [None]:
# Box plot of max_temp_c grouped by season
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='season', y='max_temp_c', ax=ax)
ax.set_title('max Temperature Distribution by Season')
plt.show()

In [None]:
# Box plot of min_temp_c grouped by season
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='season', y='min_temp_c', ax=ax)
ax.set_title('min Temperature Distribution by Season')
plt.show()

In [None]:
# Median avg_temp_c per (country, capital) pair, top five shown as horizontal bars.
# The target column is selected BEFORE the reduction: calling .median() on the
# whole group would try to reduce the non-numeric columns too, which raises on
# pandas >= 2.0 (and wastes work on older versions).
(
    df.groupby(['country', 'capital'])[['avg_temp_c']]
      .median()
      .sort_values('avg_temp_c', ascending=False)
      .head()
      .plot(kind='barh', color='teal', grid=True, figsize=(8, 2))
)
plt.xlabel('Average Temperature (°C)')
plt.ylabel('Location')
plt.title('Top 5 Hottest Cities in the World')
plt.show()

In [None]:
# Density histogram with a KDE overlay of avg_temp_c.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement that seaborn's migration guide gives for the same figure.
sns.histplot(x=df['avg_temp_c'], kde=True, stat='density')
plt.show()

# Time Series

In [None]:
# Build a frame indexed by datetime that carries only the season label and
# the average temperature. 'date' is renamed to 'datetime', coerced to a
# datetime dtype, and then promoted to the index.
date_df = (
    df[['date', 'season', 'avg_temp_c']]
    .rename(columns={'date': 'datetime'})
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .set_index('datetime')
)

In [None]:
# One sub-frame per season, each selected by exact match on the 'season' label
summer_df = date_df.loc[date_df['season'].eq('summer')]
autumn_df = date_df.loc[date_df['season'].eq('autumn')]
winter_df = date_df.loc[date_df['season'].eq('winter')]
spring_df = date_df.loc[date_df['season'].eq('spring')]

In [None]:
summer_df

In [None]:
# Point plot of the summer rows against the datetime index
ax = summer_df.plot(figsize=(15, 5), color=color_pal[0], style='.')
ax.set_title('avg_temp_c in summer season')
plt.show()

In [None]:
# Point plot of the autumn rows against the datetime index
ax = autumn_df.plot(figsize=(15, 5), color=color_pal[1], style='.')
ax.set_title('avg_temp_c in autumn season')
plt.show()

In [None]:
# Point plot of the winter rows against the datetime index
ax = winter_df.plot(figsize=(15, 5), color=color_pal[2], style='.')
ax.set_title('avg_temp_c in winter season')
plt.show()

In [None]:
# Point plot of the spring rows against the datetime index
ax = spring_df.plot(figsize=(15, 5), color=color_pal[3], style='.')
ax.set_title('avg_temp_c in spring season')
plt.show()

In [None]:
def create_features(date_df):
    """
    Return a copy of ``date_df`` extended with calendar columns.

    The columns ``dayofweek``, ``quarter``, ``month`` and ``year`` are read
    from the frame's index, which therefore has to expose those datetime
    attributes (e.g. a ``DatetimeIndex``). The input frame is not modified.
    """
    stamps = date_df.index
    # assign() works on a copy, so the caller's frame stays untouched
    return date_df.assign(
        dayofweek=stamps.dayofweek,
        quarter=stamps.quarter,
        month=stamps.month,
        year=stamps.year,
    )

date_df = create_features(date_df)

In [None]:
date_df

In [None]:
# Distribution of avg_temp_c within each calendar quarter
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='quarter', y='avg_temp_c', data=date_df, ax=ax)
ax.set_title('avg_temp_c by quarter')
plt.show()

In [None]:
# Bar plot of avg_temp_c for each calendar month
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='month', y='avg_temp_c', data=date_df, ax=ax)
ax.set_title('avg_temp_c by month')
plt.show()

# Categorical

In [None]:
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()

In [None]:
df.dtypes

In [None]:
# One-hot encode the 'season' column
df = pd.get_dummies(df, columns=['season'], drop_first=True)

# Display the first few rows of the encoded dataset
df.head()

In [None]:
# Replace the 'region' and 'capital' labels with integer codes.
# The same encoder object is refit for each column in turn, so afterwards
# `le` only holds the classes of the last column processed ('capital').
for label_column in ('region', 'capital'):
    df[label_column] = le.fit_transform(df[label_column])

df.head()

# model

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Splitting the dataset

In [None]:
# Target is avg_temp_c; features are every remaining column except the raw date
y = df['avg_temp_c']
X = df.drop(columns=['avg_temp_c', "date"])

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Display the shapes of the resulting datasets
for split_name, split in (("X_train", X_train), ("X_test", X_test),
                          ("y_train", y_train), ("y_test", y_test)):
    print(f"{split_name} shape:", split.shape)

# Model Building and Analysis


In [None]:
# Estimators to train, keyed by the name used in the printed report
models = {
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# NOTE: `model` and `y_pred` stay bound to the last estimator after the loop;
# the feature-importance and residual cells further down read `model`.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out split
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Side-by-side table of actual vs. predicted values; reset_index turns
    # the index of y_test into an ordinary column
    submit = pd.DataFrame()
    submit['Actual avg_temp_c'] = y_test
    submit['Predict_avg_temp_c'] = y_pred
    submit = submit.reset_index()

    print(f'{model_name}:')
    print(f'R2 Score: {r2:.2f}')
    # MAE and RMSE were previously computed but never reported
    print(f'MAE: {mae:.2f}')
    print(f'RMSE: {rmse:.2f}')
    print(submit.head(5))

In [None]:
import statsmodels.api as sm

X = df.drop(columns=['avg_temp_c',"date"])
y = df['avg_temp_c']

def forward_selection(df, target, significance_level=0.05):
    """
    Greedy forward feature selection driven by OLS p-values.

    Starting from no features, each round fits one OLS model (with an
    intercept) per remaining candidate, using the already selected features
    plus that candidate, and records the candidate's p-value. The candidate
    with the smallest p-value is added if that p-value is below
    ``significance_level``; otherwise the search stops.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate feature columns. Every column must be convertible to float.
    target : array-like
        Response variable passed to ``sm.OLS`` as the endogenous variable.
    significance_level : float, default 0.05
        A candidate is only accepted while its p-value is below this value.

    Returns
    -------
    list
        Selected column names, in the order in which they were added.
    """
    # A frame mixing bool columns (e.g. dummies from pd.get_dummies on recent
    # pandas) with floats converts to an object array, which statsmodels
    # rejects; a uniform float frame avoids that.
    features = df.astype(float)
    candidates = features.columns.tolist()
    best_features = []
    # Stop once every candidate has been selected; the threshold check below
    # ends the search earlier when no remaining candidate is significant.
    while len(best_features) < len(candidates):
        # Preserve column order so that ties resolve the same way on every run
        # (a set difference iterates in hash order, which varies between runs).
        remaining_features = [name for name in candidates if name not in best_features]
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(features[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if min_p_value < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

# Assuming you have already defined X and y as the features and target variable respectively
selected_features = forward_selection(X, y)
print("Selected features:", selected_features)

# feature_importances


In [None]:
# Pair every feature name with the importance reported by `model`
# (the estimator left bound by the training loop) and rank them, highest first
feature_names = X.columns
importances = model.feature_importances_
feature_importance_dict = dict(zip(feature_names, importances))
sorted_feature_importance = sorted(
    feature_importance_dict.items(),
    key=lambda item: item[1],
    reverse=True,
)

for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.2f}")

# Horizontal bar chart of the ranked importances
ranked_names, ranked_scores = zip(*sorted_feature_importance)
fig, ax = plt.subplots(figsize=(12, 7))
ax.barh(ranked_names, ranked_scores, alpha=0.9, color='teal')
ax.set_title('Feature Importance', fontsize=15)
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.show()

# Visualise the model

In [None]:
# Predict on the held-out split with the estimator bound to `model`
y_pred = model.predict(X_test)

# Residual = actual value minus predicted value
residuals = y_test - y_pred

# Scatter the residuals against the predictions, with a zero reference line
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_pred, y=residuals, ax=ax)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()