# Importing Libraries

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from persiantools.digits import fa_to_en
from persiantools.jdatetime import JalaliDateTime, JalaliDate
from datetime import datetime, date

# Data Preparation

## Data Loading

In [50]:
# Cities covered by the analysis; each one is read from "Dataset/<city>.csv"
cities = [
    "Ahvaz", "Bandar Abbas", "Isfahan", "Karaj", "Kerman",
    "Mashhad", "Rasht", "Shiraz", "Tabriz", "Tehran",
]

# Dictionary mapping every city name to the DataFrame loaded from its CSV file
dataframes = {name: pd.read_csv(f"Dataset/{name}.csv") for name in cities}

# for city, df in dataframes.items():
#     print(f"Dataframe for {city}:")
#     print(f"Shape: {df.shape}")
#     print()
# print(dataframes)

## Data Cleaning 

### Checking NaNs 

In [51]:
# Report, per city, the total number of missing cells across all columns
print("MISSING VALUES' SUM :")
for city, df in dataframes.items():
    per_column_nans = df.isnull().sum()
    nan_total = per_column_nans.to_numpy().sum()
    print(f"- {city}:", nan_total)

MISSING VALUES' SUM :
- Ahvaz: 0
- Bandar Abbas: 0
- Isfahan: 0
- Karaj: 0
- Kerman: 0
- Mashhad: 0
- Rasht: 0
- Shiraz: 0
- Tabriz: 0
- Tehran: 0


### Setting Date Column's Datatype 

In [52]:
# Convert the "time" column of every city's DataFrame to pandas datetimes
for df in dataframes.values():
    parsed_time = pd.to_datetime(df["time"])
    df["time"] = parsed_time

### Check Datatypes 

In [53]:
# Show column names, non-null counts and dtypes for the Tehran DataFrame
dataframes["Tehran"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4926 entries, 0 to 4925
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   time                             4926 non-null   datetime64[ns]
 1   temp_max (°C)                    4926 non-null   float64       
 2   temp_min (°C)                    4926 non-null   float64       
 3   temp_mean (°C)                   4926 non-null   float64       
 4   daylight_duration (s)            4926 non-null   float64       
 5   precipitation_sum (mm)           4926 non-null   float64       
 6   rain_sum (mm)                    4926 non-null   float64       
 7   snowfall_sum (cm)                4926 non-null   float64       
 8   precipitation_hours (h)          4926 non-null   float64       
 9   wind_speed_max (km/h)            4926 non-null   float64       
 10  wind_gusts_max (km/h)            4926 non-null   float64    

**Note:** All the dataframes were checked; to keep the output clean for presentation, only the result for Tehran is shown.

### Gregorian to Jalali Conversion

In [54]:
# Add a Jalali date column and rename "time" to "date_gregorian" in every DataFrame.
# The rename is guarded so this cell can be re-run safely: without the guard, a second
# execution fails with KeyError: 'time' because the column has already been renamed.
for city, df in dataframes.items():
    if "time" in df.columns:
        df.rename(columns={'time': 'date_gregorian'}, inplace=True)
    # Each element of the Gregorian column is handed to JalaliDate for conversion
    df["date_jalali"] = df["date_gregorian"].apply(lambda ts: JalaliDate(ts))

print(dataframes["Tehran"])

     date_gregorian  temp_max (°C)  temp_min (°C)  temp_mean (°C)  \
0        2011-03-21           17.4            3.9            10.8   
1        2011-03-22           18.2            6.4            11.9   
2        2011-03-23           13.3            4.6             9.6   
3        2011-03-24           12.2            1.2             7.2   
4        2011-03-25           12.4            2.7             8.0   
...             ...            ...            ...             ...   
4921     2024-09-09           33.3           21.3            27.6   
4922     2024-09-10           34.3           22.0            28.7   
4923     2024-09-11           34.6           22.7            29.0   
4924     2024-09-12           35.8           22.1            29.7   
4925     2024-09-13           33.8           22.6            28.9   

      daylight_duration (s)  precipitation_sum (mm)  rain_sum (mm)  \
0                  43745.62                     0.0            0.0   
1                  43879.68    

In [55]:
# Re-inspect the Tehran DataFrame's columns and dtypes after the Jalali conversion cell
dataframes["Tehran"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4926 entries, 0 to 4925
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   date_gregorian                   4926 non-null   datetime64[ns]
 1   temp_max (°C)                    4926 non-null   float64       
 2   temp_min (°C)                    4926 non-null   float64       
 3   temp_mean (°C)                   4926 non-null   float64       
 4   daylight_duration (s)            4926 non-null   float64       
 5   precipitation_sum (mm)           4926 non-null   float64       
 6   rain_sum (mm)                    4926 non-null   float64       
 7   snowfall_sum (cm)                4926 non-null   float64       
 8   precipitation_hours (h)          4926 non-null   float64       
 9   wind_speed_max (km/h)            4926 non-null   float64       
 10  wind_gusts_max (km/h)            4926 non-null   float64    

In [68]:
def calculate_seasons(month):
    """Return the season name for a month number.

    Months 1-3 map to 'Spring', 4-6 to 'Summer' and 7-9 to 'Autumn'.
    Every other value (including 10-12) falls through to 'Winter'.
    """
    season_by_months = (
        ((1, 2, 3), 'Spring'),
        ((4, 5, 6), 'Summer'),
        ((7, 8, 9), 'Autumn'),
    )
    for months, season_name in season_by_months:
        if month in months:
            return season_name
    # No explicit range matched, so the month is treated as winter
    return 'Winter'
    
# Derive a "season" column for every city from the month of its Jalali date
for city, df in dataframes.items():
    # Map the Jalali month directly to its season name. calculate_seasons returns
    # plain strings, so the column needs no intermediate month values or string casts.
    df["season"] = df["date_jalali"].apply(lambda jdate: calculate_seasons(jdate.month))

    # Sanity check: list the distinct season names found for this city
    print(df["season"].unique())

# info() prints its own report and returns None, so wrapping it in print()
# would add a stray "None" line to the output
dataframes["Tehran"].info()

['Spring' 'Summer' 'Autumn' 'Winter']
['Spring' 'Summer' 'Autumn' 'Winter']
['Spring' 'Summer' 'Autumn' 'Winter']
['Spring' 'Summer' 'Autumn' 'Winter']
['Spring' 'Summer' 'Autumn' 'Winter']
['Spring' 'Summer' 'Autumn' 'Winter']
['Spring' 'Summer' 'Autumn' 'Winter']
['Spring' 'Summer' 'Autumn' 'Winter']
['Spring' 'Summer' 'Autumn' 'Winter']
['Spring' 'Summer' 'Autumn' 'Winter']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4926 entries, 0 to 4925
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   date_gregorian                   4926 non-null   datetime64[ns]
 1   temp_max (°C)                    4926 non-null   float64       
 2   temp_min (°C)                    4926 non-null   float64       
 3   temp_mean (°C)                   4926 non-null   float64       
 4   daylight_duration (s)            4926 non-null   float64       
 5   precipitation_sum (mm)   

In [71]:
# NOTE(review): the season column appears to hold strings already, so this cast looks
# like a no-op left over from experimentation — confirm whether this cell is still needed
dataframes["Tehran"]["season"] = dataframes["Tehran"]["season"].astype(np.str_)
# info() prints its own report and returns None; print(df.info()) would also emit "None"
dataframes["Tehran"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4926 entries, 0 to 4925
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   date_gregorian                   4926 non-null   datetime64[ns]
 1   temp_max (°C)                    4926 non-null   float64       
 2   temp_min (°C)                    4926 non-null   float64       
 3   temp_mean (°C)                   4926 non-null   float64       
 4   daylight_duration (s)            4926 non-null   float64       
 5   precipitation_sum (mm)           4926 non-null   float64       
 6   rain_sum (mm)                    4926 non-null   float64       
 7   snowfall_sum (cm)                4926 non-null   float64       
 8   precipitation_hours (h)          4926 non-null   float64       
 9   wind_speed_max (km/h)            4926 non-null   float64       
 10  wind_gusts_max (km/h)            4926 non-null   float64    