In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q feature_engine autoviz dataprep

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings 
# Install a global "ignore" filter: every warning emitted after this point is suppressed.
warnings.filterwarnings("ignore")

from autoviz import AutoViz_Class
from dataprep.datasets import load_dataset
from dataprep.eda import create_report

import shap
import matplotlib.pyplot as plt
import seaborn as sns 
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from feature_engine.encoding import RareLabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import re

# Allow pandas to render up to 1000 rows before truncating displayed output.
pd.set_option('display.max_rows', 1000)

In [None]:
# Country table: only its 'region' column is attached to the weather data.
df_countries = pd.read_csv("/kaggle/input/global-daily-climate-data/countries.csv").drop_duplicates()
region_by_country = df_countries.set_index("country")[['region']]

# Weather observations, de-duplicated, indexed by country, enriched with the
# region and restricted to rows whose average temperature is above -273
# (the comparison is False for missing values, so those rows are removed too).
df = (
    pd.read_csv('/kaggle/input/global-daily-climate-data/weather.csv')
    .drop_duplicates()
    .set_index("country")
    .join(region_by_country, how='left')
    .loc[lambda frame: frame['avg_temp_c']>-273]
)
print(df.shape)
df.sample(5).T

# Data Exploration

In [None]:
# Preview the first rows of the joined weather table.
df.head()

In [None]:
# (number of rows, number of columns) of the table.
df.shape

In [None]:
# Keep a handle on the column index and display it.
cols = df.columns
cols

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Data type of every column (checked before converting 'date' below).
df.dtypes

In [None]:
# Convert the 'date' strings (expected as YYYY-MM-DD) into pandas datetimes.
parsed_dates = pd.to_datetime(df["date"], format="%Y-%m-%d")
df["date"] = parsed_dates

# Data visualisation

In [None]:
# Automated charts of every analysed column against the target.
AV = AutoViz_Class()
filename = ""  # left empty: the data is handed over as a DataFrame through `dfte`
target_variable = 'avg_temp_c'
custom_plot_dir = "custom_plot_directory"

# Cap the amount of data AutoViz looks at: at most 1000 rows and 50 columns.
n_rows, n_cols = df.shape
autoviz_options = dict(
    sep=",",
    depVar=target_variable,
    dfte=df,
    header=0,
    verbose=1,
    lowess=False,
    chart_format="html",
    max_rows_analyzed=min(n_rows, 10**3),
    max_cols_analyzed=min(n_cols, 50),
    save_plot_dir=custom_plot_dir,
)

dft = AV.AutoViz(filename, **autoviz_options)

In [None]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
from pathlib import Path

# Directory of the HTML charts, built from the plot directory and target name
# configured in the AutoViz cell.
plot_dir = Path(f'/kaggle/working/{custom_plot_dir}/{target_variable}/')

# Collect the chart file names. They are sorted so the charts always appear in
# the same order (glob order is arbitrary), and a dedicated loop variable is
# used so the global `filename` consumed by the AutoViz cell is not overwritten.
file_names = sorted(html_path.name for html_path in plot_dir.glob('*.html'))

# Render every saved chart inline.
for file_name in file_names:
    html_content = (plot_dir / file_name).read_text(encoding='utf-8')
    display(HTML(html_content))

In [None]:
# Build the dataprep EDA report for the whole DataFrame; as the last
# expression of the cell, the returned report object is displayed.
create_report(df)

# Null values

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Same per-column missing-value counts, drawn as a bar chart.
df.isna().sum().plot(kind ='bar')

In [None]:
 # Columns removed from the analysis. NOTE(review): presumably chosen for their
 # high share of missing values in the null counts above; confirm.
 sparse_columns = [
     'snow_depth_mm', 'avg_wind_dir_deg', 'avg_wind_speed_kmh',
     'peak_wind_gust_kmh', 'avg_sea_level_pres_hpa', 'sunshine_total_min',
 ]
 # Reassign instead of mutating in place, and ignore columns that are already
 # gone, so re-running this cell is a no-op rather than a KeyError.
 df = df.drop(columns=sparse_columns, errors='ignore')

In [None]:
# Fill missing values in 'min_temp_c' column with the mean of non-missing values
# Impute the temperature extremes with column-wide statistics:
# the mean for 'min_temp_c' and the median for 'max_temp_c'.
min_temp_fill = df['min_temp_c'].mean()
max_temp_fill = df['max_temp_c'].median()
df['min_temp_c'] = df['min_temp_c'].fillna(min_temp_fill)
df['max_temp_c'] = df['max_temp_c'].fillna(max_temp_fill)

# Rows lacking either a region or a precipitation value are discarded.
required_columns = ['region', 'precipitation_mm']
df = df.dropna(subset=required_columns)

# Report what is still missing, together with the remaining table size.
missing_value_counts = df.isna().sum()
print("Count of missing values in each column:")
print(missing_value_counts)
print(df.shape)

# Categorical

In [None]:
from sklearn.preprocessing import LabelEncoder 
# Encoder that maps categorical labels to integer codes.
le = LabelEncoder()


In [None]:
# Column dtypes after cleaning, to see which columns still need encoding.
df.dtypes

In [None]:
# One-hot encode the 'season' column
# Replace 'season' with indicator columns; the first category is dropped and
# serves as the implicit baseline.
one_hot_columns = ['season']
df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

# Show the first rows of the encoded table.
df.head()

In [None]:
# Transform the 'region' column using label encoding
# Give 'region' its own encoder. Refitting one shared encoder for both columns
# would discard the region mapping, making the region codes impossible to
# decode later (e.g. with region_encoder.inverse_transform).
region_encoder = LabelEncoder()
df['region'] = region_encoder.fit_transform(df['region'])

# 'capital' keeps using the existing `le` instance, so `le` ends up fitted on
# the capital labels exactly as before; the alias just gives it a clear name.
capital_encoder = le
df['capital'] = capital_encoder.fit_transform(df['capital'])

df.head()

# Model

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Splitting the dataset into training and test sets


In [None]:
# Target is the average temperature; every other column except the raw date
# is used as a feature.
y = df['avg_temp_c']
X = df.drop(columns=['avg_temp_c',"date"])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
for split_label, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_label} shape:", split_data.shape)

# Model Building and Analysis

In [None]:
# Candidate regressors keyed by the name used in the printed report.
models = {
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}
best_model = None
# R2 can be negative, so start below any reachable score; starting at 0 would
# leave `best_model` as None whenever every model scores <= 0.
best_r2 = float('-inf')

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out split
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Side-by-side table of actual and predicted target values
    submit = pd.DataFrame()
    submit['Actual avg_temp_c'] = y_test
    submit['Predict_avg_temp_c'] = y_pred
    submit = submit.reset_index()

    # Track the best scorer by R2 (stored as the estimator's class name)
    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__

    print(f'{model_name}:')
    print(f'R2 Score: {r2:.2f}')
    print(f'MAE: {mae:.2f}')
    print(f'RMSE: {rmse:.2f}')
    print(submit.head(5))

    print('----------------------------------------')
# The reported figure is the R2 score of a regressor, not a classification accuracy.
print(f"The best performing model is: {best_model} with R2 score: {best_r2:.2f}")

# Feature importances

In [None]:
# `model` is the estimator left bound by the training loop.
importances = model.feature_importances_
feature_names = X.columns

# Pair each feature with its importance and rank from most to least important.
feature_importance_dict = {name: score for name, score in zip(feature_names, importances)}
sorted_feature_importance = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.2f}")

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(12, 7))
ranked_features, ranked_scores = zip(*sorted_feature_importance)
plt.barh(ranked_features, ranked_scores, alpha=0.9, color='teal')
plt.title('Feature Importance', fontsize=15)
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()