In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Called with no style argument — NOTE(review): presumably leaves the current seaborn style unchanged; confirm this call is needed
sns.set_style()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q feature_engine autoviz dataprep

In [None]:
# import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings 
warnings.filterwarnings("ignore")

from autoviz import AutoViz_Class
from dataprep.datasets import load_dataset
from dataprep.eda import create_report

import shap
import matplotlib.pyplot as plt
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from feature_engine.encoding import RareLabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import re

# Allow up to 1000 rows when a frame is displayed (a later setup cell sets this same option to None)
pd.set_option('display.max_rows', 1000)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import warnings
# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')
# Plot theme; `sns` is the seaborn alias imported in the first cell
sns.set_theme(style='darkgrid', palette='colorblind')
# Remove the pandas display limits on columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from sklearn.preprocessing import LabelEncoder 
# Single shared encoder instance; the encoding cells call fit_transform on it once per column
le = LabelEncoder()

In [None]:
%%time
# Daily weather observations and country metadata, each de-duplicated on load
weather_raw = pd.read_csv('/kaggle/input/global-daily-climate-data/weather.csv').drop_duplicates()
df_countries = pd.read_csv("/kaggle/input/global-daily-climate-data/countries.csv").drop_duplicates()

# Attach each country's region; the joined frame is indexed by country
region_by_country = df_countries.set_index("country")[['region']]
df = weather_raw.set_index("country").join(region_by_country, how='left')

# Keep only records with a valid average temperature (> -273);
# rows where avg_temp_c is missing fail the comparison and are removed as well
df = df[df['avg_temp_c'] > -273]
print(df.shape)
df.sample(5).T

In [None]:
# Preview the first rows of the joined frame
df.head()

In [None]:
# Flag rows whose column values repeat an earlier row (the first occurrence is not flagged)
is_repeat = df.duplicated(keep='first')
duplicate_rows = df[is_repeat]
# How many repeated rows were found
num_duplicates = len(duplicate_rows)
print(f"Number of duplicate rows: {num_duplicates}")
# Show the repeated rows themselves
duplicate_rows

In [None]:
# Count of missing values in each column
df.isna().sum()

# Data visualisation

In [None]:
# Missing-value count per column, drawn as a bar chart
missing_per_column = df.isna().sum()
missing_per_column.plot(kind='bar')

In [None]:
import missingno as msno

# missingno's matrix plot of the frame, used here to inspect where values are missing
msno.matrix(df)

In [None]:
# Parse the date column from YYYY-MM-DD strings into datetimes
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")

In [None]:
# Measurement columns that are excluded from the rest of the analysis
unused_columns = [
    'snow_depth_mm',
    'avg_wind_dir_deg',
    'peak_wind_gust_kmh',
    'avg_sea_level_pres_hpa',
    'sunshine_total_min',
    'avg_wind_speed_kmh',
]
df = df.drop(columns=unused_columns)

In [None]:
# Missing values per column after the column drop
df.isna().sum()

In [None]:
# Fill values computed from the observed data.
# NOTE(review): the minimum uses the mean while the maximum uses the median — confirm this asymmetry is intended.
min_temp_fill = df['min_temp_c'].mean()
max_temp_fill = df['max_temp_c'].median()

# Replace missing daily minimum / maximum temperatures with those fill values
df['min_temp_c'] = df['min_temp_c'].fillna(min_temp_fill)
df['max_temp_c'] = df['max_temp_c'].fillna(max_temp_fill)

In [None]:
# Rows must have both a region and a precipitation reading to be kept
required_columns = ['region', 'precipitation_mm']
df = df.dropna(subset=required_columns)

In [None]:
# Missing values per column after imputation and row removal
df.isna().sum()

In [None]:
# (rows, columns) remaining after cleaning
df.shape

In [None]:
# Data type of each remaining column
df.dtypes

In [None]:
# Number of records per season
season_distribution = df['season'].value_counts()

# Bar chart of the season counts on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
season_distribution.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Distribution of Seasons')
ax.set_xlabel('Season')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns only.
# The frame still holds text columns (e.g. season, region) at this point, and
# DataFrame.corr raises on them in pandas >= 2.0 unless numeric_only=True is passed.
Corr_Matrix = df.corr(numeric_only=True)

# Set up the figure and plot the heatmap
plt.figure(figsize=(10, 10))
sns.heatmap(Corr_Matrix, annot=True, cmap='coolwarm', center=0)
plt.show()

In [None]:
# Box plot of the average temperature for each season
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='season', y='avg_temp_c', data=df, ax=ax)
ax.set_title('Average Temperature Distribution by Season')
plt.show()

In [None]:
# Single box plot of avg_temp_c across all records
sns.catplot(data=df, y="avg_temp_c",  kind="box")

In [None]:
# Median avg_temp_c per (country, capital) pair.
# The column is selected before aggregating so the median is never attempted on
# the text columns, which raises in pandas >= 2.0.
median_temp_by_city = df.groupby(['country', 'capital'])[['avg_temp_c']].median()

# Five highest medians, drawn as horizontal bars
hottest_five = median_temp_by_city.sort_values('avg_temp_c', ascending=False).head()
hottest_five.plot(kind='barh', color='teal', grid=True, figsize=(8, 2))
plt.xlabel('Average Temperature (°C)')
plt.ylabel('Location')
plt.title('Top 5 Hottest Cities in the World')
plt.show()

In [None]:
# Scatter of avg_temp_c against season
sns.scatterplot(data=df, x="season", y="avg_temp_c")


In [None]:
# Histogram of record counts per season
sns.histplot(df, x="season")

In [None]:
# Adapted from https://www.kaggle.com/code/anshtanwar/auto-eda-missing-migrants-interactive-charts
# by @anshtanwar
AV = AutoViz_Class()
filename = ""
target_variable = 'avg_temp_c'
custom_plot_dir = "custom_plot_directory"

# Analyse at most 1000 rows and 50 columns of the frame
row_cap = min(df.shape[0], 10**3)
column_cap = min(df.shape[1], 50)

# Keyword options for the AutoViz run; the charts are written as HTML under custom_plot_dir
autoviz_options = dict(
    sep=",",
    depVar=target_variable,
    dfte=df,
    header=0,
    verbose=1,
    lowess=False,
    chart_format="html",
    max_rows_analyzed=row_cap,
    max_cols_analyzed=column_cap,
    save_plot_dir=custom_plot_dir,
)
dft = AV.AutoViz(filename, **autoviz_options)

In [None]:
# IPython.display is the public import location; importing display from
# IPython.core.display is deprecated.
from IPython.display import HTML, display
from pathlib import Path

# Directory the AutoViz cell wrote its HTML charts into
plot_dir = Path('/kaggle/working') / custom_plot_dir / target_variable

# Sorted so the charts render in the same order on every run
html_paths = sorted(plot_dir.glob('*.html'))
file_names = [html_path.name for html_path in html_paths]

# Render each saved chart inline
for html_path in html_paths:
    # assumes the charts are UTF-8 encoded HTML — TODO confirm against AutoViz output
    display(HTML(html_path.read_text(encoding='utf-8')))

In [None]:
# Build dataprep's EDA report for the cleaned frame
create_report(df)

# Categorical encoding and null check

In [None]:
# Column types before the categorical encoding below
df.dtypes

In [None]:
# Integer-encode the capital and region labels.
# The shared encoder `le` is refit for each column in turn.
for label_column in ('capital', 'region'):
    df[label_column] = le.fit_transform(df[label_column])

# One-hot encode season, dropping the first level
df = pd.get_dummies(df, columns=['season'], drop_first=True)

In [None]:
# Label each record with its decade, e.g. 1995 -> '1990s' (vectorised, no per-row lambda)
decade_prefix = (df['date'].dt.year // 10).astype(str)
df['decade'] = decade_prefix + '0s'

In [None]:
# Replace the decade labels with integer codes, refitting the shared `le` encoder
df['decade'] = le.fit_transform(df['decade'])

In [None]:
# Column types after encoding
df.dtypes

In [None]:
# Missing values per column after encoding
df.isna().sum()

# Top 5 Most Positively Correlated

In [None]:
print('Top 5 Most Positively Correlated to the Target Variable')
# Exclude the target's correlation with itself (always 1.0) so that five
# actual features are listed rather than the target plus four features.
target_correlations = Corr_Matrix['avg_temp_c'].drop('avg_temp_c', errors='ignore')
target_correlations.sort_values(ascending=False).head(5)

# Top 5 Most Negatively Correlated

In [None]:
print('Top 5 Most Negatively Correlated to the Target Variable')
# Correlation of every column with the target, smallest values first
correlations_with_target = Corr_Matrix['avg_temp_c']
correlations_with_target.sort_values().head(5)

# Model 

In [None]:
# Target column, plus the columns that are not used as model inputs
excluded_columns = ['avg_temp_c', 'date', 'decade']
X = df.drop(columns=excluded_columns)
y = df['avg_temp_c']

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the shape of each resulting split
for split_label, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_label} shape:", split_data.shape)

In [None]:
 models = {
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}
best_model = None
best_r2 = 0

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred= model.predict(X_test)

    # Evaluate the model
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    submit = pd.DataFrame()
    submit['Actual avg_temp_c'] = y_test
    submit['Predict_avg_temp_c'] = y_pred
    submit = submit.reset_index()
    r2 = r2_score(y_test, y_pred)
    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__

    print(f'{model_name}:')
    print(f'R2 Score: {r2:.2f}')
    print(submit.head(5))
    print('----------------------------------------')
print(f"The best performing model is: {best_model} with accuracy: {best_r2:.2f}")

# Feature importances

In [None]:
# `model` is the estimator left bound by the training loop
importances = model.feature_importances_
feature_names = X.columns

# Pair each feature with its importance, then rank from most to least important
feature_importance_dict = dict(zip(feature_names, importances))
sorted_feature_importance = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.2f}")

# Horizontal bar chart of the ranked importances
ranked_names, ranked_scores = zip(*sorted_feature_importance)
fig, ax = plt.subplots(figsize=(12, 7))
ax.barh(ranked_names, ranked_scores, alpha=0.9, color='teal')
ax.set_title('Feature Importance', fontsize=15)
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.show()