### The main goal of this notebook is to perform basic profiling of the original files, helping to explore, clean, and understand the three data sources (the three CSV files provided). It also standardizes the files so they can later be processed, normalized, and loaded into the DB.

### Libraries

In [None]:
import pandas as pd
from IPython.display import Image
from pandas_profiling import ProfileReport


MUNICIPALITIES AND GEOGRAPHICAL BREAKDOWNS: 
*   Avellino Municipalities [CSV] [118x23]
*   Caserta Municipalities [CSV] [104x23]
*   Salerno Municipalities [CSV] [154x23]


### Data source descriptions

In [None]:
# Render ./images/avellino.png as this cell's output (Avellino data source description).
Image(filename="./images/avellino.png")

In [None]:
# Render ./images/caserta.png as this cell's output (Caserta data source description).
Image(filename="./images/caserta.png")

In [None]:
# Render ./images/salerno.png as this cell's output (Salerno data source description).
Image(filename="./images/salerno.png")

In [None]:
# Load the three original municipality extracts. index_col=False stops pandas
# from promoting the first CSV column to the index.
df_avellino_municipalities = pd.read_csv(
    "./data/Avellino_municipalities.csv",
    index_col=False,
)
df_caserta_municipalities = pd.read_csv(
    "./data/Caserta_municipalities.csv",
    index_col=False,
)
df_salerno_municipalities = pd.read_csv(
    "./data/Salerno_municipalities.csv",
    index_col=False,
)

# Report (rows, columns) for each source so it can be compared with the
# dimensions listed in the overview above.
for shape_message, raw_frame in (
    ("Dimensions of Avellino Municipalities : ", df_avellino_municipalities),
    ("Dimensions of Caserta Municipalities : ", df_caserta_municipalities),
    ("Dimensions of Salerno Municipalities : ", df_salerno_municipalities),
):
    print(shape_message, raw_frame.shape)


### Pandas Profiling

In [None]:
# Build the pandas-profiling report for the Avellino municipalities frame.
profile_avellino = ProfileReport(df_avellino_municipalities, title= "Avellino")
# Bare name as the last expression so the notebook renders the report.
profile_avellino


In [None]:
# Build the pandas-profiling report for the Caserta municipalities frame.
profile_caserta = ProfileReport(df_caserta_municipalities, title= "Caserta")
# Bare name as the last expression so the notebook renders the report.
profile_caserta

In [None]:
# Build the pandas-profiling report for the Salerno municipalities frame.
profile_salerno = ProfileReport(df_salerno_municipalities, title= "Salerno")
# Bare name as the last expression so the notebook renders the report.
profile_salerno


### Data cleaning and re-running the profiling

In [None]:
# Reload the "_c1" versions of the three files for the cleaning steps below.
# NOTE: the Caserta frame is bound to `df_casesrta_municipalities` (sic); the
# later cells in this notebook use that exact spelling, so it is kept here.
(
    df_avellino_municipalities,
    df_casesrta_municipalities,
    df_salerno_municipalities,
) = [
    pd.read_csv(c1_csv_path, index_col=False)
    for c1_csv_path in (
        "./data/Avellino_municipalities_c1.csv",
        "./data/Caserta_municipalities_c1.csv",
        "./data/Salerno_municipalities_c1.csv",
    )
]

In [None]:
def get_day(x):
    """Extract the day number from a "<day> <month>" string.

    Parameters
    ----------
    x : str or missing value
        Text whose first whitespace-separated token is the day number,
        e.g. "12 agosto". Missing values (NaN / None / pd.NA) are accepted.

    Returns
    -------
    int or pd.NA
        The first token converted to int, or pd.NA when ``x`` is missing or
        contains only whitespace.

    Raises
    ------
    ValueError
        If the first token is not an integer literal.
    """
    if pd.isna(x):
        return pd.NA

    # split() without a separator ignores leading/trailing whitespace and
    # collapses repeated spaces; split(" ") would yield empty tokens there
    # and make int() fail on otherwise valid values.
    tokens = x.split()
    if not tokens:
        return pd.NA

    return int(tokens[0])

def get_month(x):
    """Extract the capitalized month name from a "<day> <month>" string.

    Parameters
    ----------
    x : str or missing value
        Text whose second whitespace-separated token is the month name,
        e.g. "12 agosto". Missing values (NaN / None / pd.NA) are accepted.

    Returns
    -------
    str or pd.NA
        The second token passed through ``str.capitalize()`` (first letter
        upper-case, the rest lower-case), or pd.NA when ``x`` is missing or
        has no second token.
    """
    if pd.isna(x):
        return pd.NA

    # split() without a separator tolerates leading/trailing and repeated
    # whitespace; split(" ") would return an empty string as the "month"
    # for input such as "12  agosto".
    tokens = x.split()
    if len(tokens) < 2:
        return pd.NA

    return tokens[1].capitalize()


#### Split `Giorno festivo` into separate day and month columns and standardized the date

In [None]:


# Apply the same transformation to each province frame: derive the month and
# day columns from "Giorno festivo", then drop the original combined column.
# The frames are modified in place, so the existing variables keep pointing
# at the updated data.
for festivo_frame in (
    df_avellino_municipalities,
    df_casesrta_municipalities,
    df_salerno_municipalities,
):
    festivo_values = festivo_frame["Giorno festivo"]
    festivo_frame["Giorno festivo month"] = festivo_values.apply(get_month)
    festivo_frame["Giorno festivo day"] = festivo_values.apply(get_day)
    festivo_frame.drop(["Giorno festivo"], axis=1, inplace=True)

#### Converted missing-value placeholders to NULLs (`pd.NA`)

In [None]:

# Placeholder strings used in the source files for "no information", all
# mapped to pd.NA. The same mapping is applied to every frame.
# NOTE(review): the pd.NA -> pd.NA entry is carried over unchanged; presumably
# it is meant to normalise already-missing values to pd.NA - confirm.
missing_value_markers = {
    "Informazione Assente": pd.NA,
    pd.NA: pd.NA,
    "Non ha frazioni": pd.NA,
    "Informazione assente": pd.NA,
}

df_avellino_municipalities = df_avellino_municipalities.replace(missing_value_markers)
df_casesrta_municipalities = df_casesrta_municipalities.replace(missing_value_markers)
df_salerno_municipalities = df_salerno_municipalities.replace(missing_value_markers)


#### Split `Geolocalizzazione` into latitude and longitude (`GeolocalizzazioneLattitude` and `GeolocalizzazioneLongitude`)

In [None]:


def _geolocation_component(value, position):
    """Return one coordinate of a "<latitude>,<longitude>" string as a float.

    Parameters
    ----------
    value : str or missing value
        Comma-separated coordinate pair, or NaN / None / pd.NA.
    position : int
        0 for the part before the comma, 1 for the part after it.

    Returns
    -------
    float
        The requested coordinate, or NaN when ``value`` is missing. NaN is
        used (rather than pd.NA) so the resulting column stays float-typed.

    Raises
    ------
    ValueError
        If the selected part is not a valid float literal.
    IndexError
        If ``value`` has no part at ``position``.
    """
    # Missing values have no .split(); the placeholder-to-pd.NA replacement
    # earlier in the notebook can introduce them in this column.
    if pd.isna(value):
        return float("nan")
    return float(value.split(",")[position])


# Same transformation for each province frame: add the two coordinate columns
# (column names, including the "Lattitude" spelling, are kept as-is because
# they end up in the saved CSVs), then drop the combined source column.
for geo_frame in (
    df_avellino_municipalities,
    df_casesrta_municipalities,
    df_salerno_municipalities,
):
    geo_values = geo_frame["Geolocalizzazione"]
    geo_frame["GeolocalizzazioneLattitude"] = geo_values.apply(_geolocation_component, args=(0,))
    geo_frame["GeolocalizzazioneLongitude"] = geo_values.apply(_geolocation_component, args=(1,))
    geo_frame.drop(["Geolocalizzazione"], axis=1, inplace=True)


### Profiling on the new cleaned files

In [None]:

# Build the pandas-profiling report for the Salerno frame, titled "Salerno Cleaned".
profile_salerno_c_processed= ProfileReport(df_salerno_municipalities, title= "Salerno Cleaned")
# Bare name as the last expression so the notebook renders the report.
profile_salerno_c_processed

In [None]:

# Build the pandas-profiling report for the Caserta frame, titled "Caserta Cleaned".
# Note: the input variable is spelled `df_casesrta_municipalities` in this notebook.
profile_caserta_c_processed= ProfileReport(df_casesrta_municipalities, title= "Caserta Cleaned")
# Bare name as the last expression so the notebook renders the report.
profile_caserta_c_processed

In [None]:


# Build the pandas-profiling report for the Avellino frame, titled "Avellino Cleaned".
profile_avellino_c_processed= ProfileReport(df_avellino_municipalities, title= "Avellino Cleaned")
# Bare name as the last expression so the notebook renders the report.
profile_avellino_c_processed

#### Saving the cleaned files

In [None]:
# Write each cleaned frame to its "_c2" CSV; index=False keeps the pandas
# row index out of the output files.
for cleaned_frame, c2_csv_path in (
    (df_avellino_municipalities, "./data/Avellino_municipalities_c2.csv"),
    (df_casesrta_municipalities, "./data/Caserta_municipalities_c2.csv"),
    (df_salerno_municipalities, "./data/Salerno_municipalities_c2.csv"),
):
    cleaned_frame.to_csv(c2_csv_path, index=False)