# Importing Libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from persiantools.jdatetime import JalaliDate
from datetime import datetime, date

# Data Preparation

## Dataset Combining 

#### load geography information

In [4]:
geography = pd.read_csv(f"Dataset/Geography Information.csv")
print(geography)

    latitude  longitude  elevation            city
0  35.676624  51.428570     1183.0        Tehran  
1  38.066784  46.150738     1399.0        Tabriz  
2  36.309315  59.648438      986.0       Mashhad  
3  32.653778  51.695120     1579.0       Isfahan  
4  29.630930  52.560837     1541.0        Shiraz  
5  31.318100  48.700836       23.0         Ahvaz  
6  37.293495  49.535810        4.0         Rasht  
7  30.263618  57.119434     1762.0        Kerman  
8  27.170473  56.224940        9.0  Bandar Abbas  
9  35.817223  50.980644     1341.0           Karaj


#### load city information

In [5]:
# City name lists
cities = [
    "Ahvaz",
    "Bandar Abbas",
    "Isfahan",
    "Karaj",
    "Kerman",
    "Mashhad",
    "Rasht",
    "Shiraz",
    "Tabriz",
    "Tehran"
]

# list to keep dataframes
dataframes = []

# Reading DataFrames and Saving them in a dictionary
for city in cities:
    df = pd.read_csv(f"Dataset/{city}.csv")
    df['city'] = city
    dataframes.append(df)

#### Combine what we have gotten so far

In [124]:
## Data Cleaning 
df_combined = pd.concat(dataframes, ignore_index=True)
df_combined = pd.merge(df_combined, geography, on='city', how='left')

# Save the combined dataframe to a new CSV file
df_combined.to_csv("Complete Dataset.csv", index=False)

## Data Cleaning 

In [6]:
df = pd.read_csv("Complete Dataset.csv")

### Checking NaNs 

In [10]:
print("Dataset Missing Values:")
missing_values = df.isnull().sum()
print(missing_values)

Dataset Missing Values:
time                               0
temp_max (°C)                      0
temp_min (°C)                      0
temp_mean (°C)                     0
daylight_duration (s)              0
precipitation_sum (mm)             0
rain_sum (mm)                      0
snowfall_sum (cm)                  0
precipitation_hours (h)            0
wind_speed_max (km/h)              0
wind_gusts_max (km/h)              0
wind_direction_dominant (°)        0
shortwave_radiation_sum (MJ/m²)    0
evapotranspiration (mm)            0
city                               0
latitude                           0
longitude                          0
elevation                          0
dtype: int64


### Setting Date Column's Datatype 

In [7]:
# Fixing the date data type
df = pd.read_csv("Complete Dataset.csv")
df["time"] = pd.to_datetime(df["time"])

### Check Datatypes 

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49260 entries, 0 to 49259
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   time                             49260 non-null  datetime64[ns]
 1   temp_max (°C)                    49260 non-null  float64       
 2   temp_min (°C)                    49260 non-null  float64       
 3   temp_mean (°C)                   49260 non-null  float64       
 4   daylight_duration (s)            49260 non-null  float64       
 5   precipitation_sum (mm)           49260 non-null  float64       
 6   rain_sum (mm)                    49260 non-null  float64       
 7   snowfall_sum (cm)                49260 non-null  float64       
 8   precipitation_hours (h)          49260 non-null  float64       
 9   wind_speed_max (km/h)            49260 non-null  float64       
 10  wind_gusts_max (km/h)            49260 non-null  float64  

### Gregorian to Jalali Convertion 

In [104]:
for city, df in dataframes.items():
    df["date_jalali"] = df["time"].apply(lambda time: JalaliDate(time))  # create Jalali date column
    df.rename(columns={'time': 'date_gregorian'}, inplace=True)  # rename time to date_gregorian

#### Calculating Seasons for analysis based on Jalali date

In [106]:
df["season"] = df["date_jalali"].apply(lambda date: 
                                        1 if date.month in [1, 2, 3] else 
                                        2 if date.month in [4, 5, 6] else 
                                        3 if date.month in [7, 8, 9] else 
                                        4  
                                        ).astype(np.int8)

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4926 entries, 0 to 4925
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   date_gregorian                   4926 non-null   datetime64[ns]
 1   temp_max (°C)                    4926 non-null   float64       
 2   temp_min (°C)                    4926 non-null   float64       
 3   temp_mean (°C)                   4926 non-null   float64       
 4   daylight_duration (s)            4926 non-null   float64       
 5   precipitation_sum (mm)           4926 non-null   float64       
 6   rain_sum (mm)                    4926 non-null   float64       
 7   snowfall_sum (cm)                4926 non-null   float64       
 8   precipitation_hours (h)          4926 non-null   float64       
 9   wind_speed_max (km/h)            4926 non-null   float64       
 10  wind_gusts_max (km/h)            4926 non-null   float64    