# Advanced Data Preprocessing
## Merging DataFrames
We need to specify how the merge takes place: is it an outer join or an inner join? The terminology mirrors the joins used in relational databases.

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Two small frames keyed by "id"; ids 2 and 3 appear in both, the others in only one.
df1 = pd.DataFrame(
    {"id": [1, 2, 3, 4], "Role": ["director", "HR", "TA", "professor"]}
).set_index("id")
df2 = pd.DataFrame(
    {
        "id": [7, 8, 2, 3],
        "School": [
            "Business",
            "Law",
            "Social Sciences",
            "Artificial Intelligence and Machine Learning",
        ],
    }
).set_index("id")

# Outer join on the index: every id found in either frame is kept.
outer_df = df1.merge(df2, how="outer", left_index=True, right_index=True)
print(outer_df)

In [None]:
# Inner join on the index: only the ids present in both frames are kept.
inner_df = df1.merge(df2, how="inner", left_index=True, right_index=True)
print(inner_df)

In [None]:
# Left join on the index: every row of df1 is kept, matched to df2 where the id exists there.
left_outer = df1.merge(df2, how="left", left_index=True, right_index=True)
print(left_outer)

In [None]:
# Same left join with the roles swapped: every row of df2 is kept this time.
left_outer_2 = df2.merge(df1, how="left", left_index=True, right_index=True)
print(left_outer_2)

Even though the indexes are a natural reference when merging, that is not always the case. It is therefore possible to merge on the values of specific columns using the ***on*** parameter (again, as in databases).

In [None]:
# Show both frames before the new column is added.
# NOTE(review): the original remark said the indexes are the default numerical
# series at this point, but nothing visible resets the "id" index - confirm.
print(df1)
print(df2)

# Give every row a name so the frames can be joined on a column rather than on the index.
for frame, names in ((df1, ["a", "b", "c", "d"]), (df2, ["b", "e", "f", "c"])):
    frame["Name"] = names

# del(df1["id"])
# del(df2["id"])
print(df1)
print(df2)


In [None]:
# Left join on the shared "Name" column: every df1 row is kept and matched to df2 by name.
df_name = df1.merge(df2, how="left", on="Name")
print(df_name)
# Joining on two columns at once: a row matches only when both the first and
# the last name agree.
df3 = pd.DataFrame(
    [
        ("a", "la", "HR"),
        ("b", "lb", "director"),
        ("c", "lc", "TA"),
        ("d", "ld", "professor"),
    ],
    columns=["first_name", "last_name", "Role"],
)

df4 = pd.DataFrame(
    [
        ("a", "la", "Social Science"),
        ("b", "x", "Law"),
        ("c", "lc", "Engineering"),
        ("f", "la", "Law"),
    ],
    columns=["first_name", "last_name", "School"],
)
print(df3.merge(df4, on=["first_name", "last_name"]))


## Pandas idioms
A number of best practices have been established by the framework's developers and users. Code that follows these practices is generally referred to as ***pandorable***. The rest of this section demonstrates a number of such idioms.
### Method chaining

In [None]:
# Load the census dataset.
df = pd.read_csv("utility_files/census.csv")

# print(df.head())
# Goal: keep the rows with SUMLEV == 50 whose 2010 population estimate is at
# least 50k, restricted to a handful of columns.

# Columns to keep: the 2010 estimate followed by the first seven columns of the file.
final_col = ["POPESTIMATE2010", *df.columns[:7]]
print(final_col)

# Pandorable style: chain the steps instead of naming intermediate results
# that are never used again.
better_df = (
    df[(df["POPESTIMATE2010"] >= 50000) & (df["SUMLEV"] == 50)]
    .loc[:, final_col]
    .reset_index()
)
print(better_df.head())


In [None]:
# DataFrame.apply returns a new pandas object (Series or DataFrame) holding the
# result of calling the given function on every row (axis=1) or column (axis=0).
better_df.columns = list(map(str.lower, better_df.columns))

# Weights used by code() to combine the geographic identifiers of a row.
regions, divisions, state, counties = 10, 15, 50, 20
print(better_df.head())
# A new "code" column holding the result of code() is added to the frame next.
def code(row):
    """Return a weighted sum of the row's state, county, region and division.

    Each field is multiplied by the module-level weight ``state``,
    ``counties``, ``regions`` and ``divisions`` respectively.
    """
    state_part = row["state"] * state
    county_part = row["county"] * counties
    region_part = row["region"] * regions
    division_part = row["division"] * divisions
    return state_part + county_part + region_part + division_part
 
# Row-wise apply: code() is called once per row and its results form the new column.
better_df["code"] = better_df.apply(code, axis=1)
print(better_df.head())

The idioms covered here are by no means exhaustive. Many more can be found on forums such as Stack Overflow as well as in the official documentation.

## Pandas.Groupby()


In [None]:
# DataFrame.groupby splits the frame according to the criterion given as a
# parameter and yields (group key, sub-frame) pairs, one per distinct key.
df = pd.read_csv("utility_files/census.csv")
for state_name, state_frame in df.groupby("STNAME"):  # one group per state name
    mean_population = np.average(state_frame["POPESTIMATE2010"])
    print(f"The average of population in the state {state_name!s} is estimated as {mean_population!s}")
# quite efficient

In [None]:
# Groups do not have to come from column values: groupby also accepts a function.
# Unless told otherwise, that function is called on each index label, and rows
# whose labels map to the same return value end up in the same group.

print(df.shape[0])
# Split the frame into batches of consecutive rows (200 rows per batch by default).
def batch_number(index, batch_size=200):
    """Return the zero-based batch that a row label belongs to.

    Parameters
    ----------
    index : int
        Row label; groupby passes one index label per call.
    batch_size : int, default 200
        Number of consecutive labels per batch; must be positive.

    Returns
    -------
    int
        ``index // batch_size``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    return index // batch_size

# enumerate supplies the 1-based batch counter; the group key itself is not
# needed for the message, so it is discarded.
for i, (_, frame) in enumerate(df.groupby(batch_number), start=1):
    batch_mean = np.average(frame["POPESTIMATE2010"])
    # print(group)
    print("The average population in the batch number {} in the year 2010 is estimated as {}".format(str(i), str(batch_mean)))


In [None]:
# A point that should be addressed is what happens when there is a hierarchical (multi-level) index
# print(df.head())
# let's assume there are multiple indices for example: STNAME and CTYNAME
# df = df.set_index(["STNAME", "CTYNAME"])

# for group, frame in df.groupby(level=(0,1)): # this tells Pandas that two rows with a different combination are indeed different
# print(group)

# (pattern, label) pairs tried in order; the first pattern that matches the
# start of the name decides the label.
_CITY_LABELS = (
    (re.compile("[A-Ha-h]{1}.*"), "A-H city"),
    (re.compile("[I-Ri-r]{1}.*"), "I-R city"),
)


def grouping_differently(item):
    """Map a two-part index entry to ``(item[0], <alphabetical bucket>)``.

    ``item[1]`` is bucketed by its first character: "A-H city", "I-R city" or,
    for anything else, the "S-Z city" fallback.
    """
    first_level, name = item[0], item[1]
    for pattern, label in _CITY_LABELS:
        if pattern.match(name) is not None:
            return (first_level, label)
    return (first_level, "S-Z city")

# grouping_differently expects (STNAME, CTYNAME) tuples, so group a copy of the
# frame indexed by those two columns. df itself keeps its integer index, which
# the batch-based grouping elsewhere in this notebook relies on.
for group, frame in df.set_index(["STNAME", "CTYNAME"]).groupby(grouping_differently):
    print(group)

## Aggregation
So far no complex processing has taken place; however, groupby also supports more advanced aggregate functions.

In [None]:
# Bucket a name alphabetically by its first letter.
def name_cat(name):
    """Return "A-H state", "I-R state" or "S-Z state" for *name*.

    The bucket is chosen from the first character; a name that starts with
    anything outside A-R (either case) falls into "S-Z state".
    """
    if re.match("[A-Ha-h]{1}.*", name):
        return "A-H state"
    if re.match("[I-Ri-r]{1}.*", name):
        return "I-R state"
    return "S-Z state"
# Group the data by state name and compute several statistics per state.
# df = df.set_index("STNAME")

# print(df.groupby("STNAME")["POPESTIMATE2010"].agg(["min", "max", "mean"]))
# The aggregations are named by string instead of passing NumPy callables:
# recent pandas releases deprecate the implicit np.min / np.mean / np.std
# substitution, and the strings give stable result labels
# ("min", "max", "mean", "std") regardless of the NumPy version.
df_states = df.groupby("STNAME").agg(
    {
        "POPESTIMATE2010": ["min", "max"],
        "POPESTIMATE2011": ["mean"],
        "POPESTIMATE2012": ["std"],
    }
)
# print(df_states["POPESTIMATE2010"]["min"])
print("\n\n\n")
# print(df_states["POPESTIMATE2010"])

def custom_function(series):
    """Return the sum of the values in *series*, as computed by ``np.sum``."""
    total = np.sum(series)
    return total

# Any function can be passed to agg as long as it accepts a pandas Series.
# Built-in reductions are requested by name: passing NumPy callables such as
# np.min to agg is deprecated in recent pandas releases.
functions_to_apply = ["min", "max", "mean", "std", custom_function, "sum"]
# df.reset_index()

# The same list of aggregations is applied to each population-estimate column.
population_columns = ["POPESTIMATE2010", "POPESTIMATE2011", "POPESTIMATE2012"]
df_batch_rank = df.groupby(batch_number).agg(
    {column: functions_to_apply for column in population_columns}
)
print(df_batch_rank.head())



## Pd.transform()
This method can be slightly tricky. It accepts a function reference as an argument and returns a result that has the same index (and therefore the same length) as the data frame / series it was called on.
The question arises now: what is the difference between pd.apply() and pd.transform()?
With apply, the function receives a whole row, column or group and may return anything, including a single aggregated value. With transform, the result must line up with the input, so a value computed for a group is broadcast back to every row of that group.

In [None]:
# Load the Airbnb listings dataset.
df = pd.read_csv("utility_files/listings.csv")
# print(df.iloc[50: 60,:])
# print(df.columns)

# Keep only the columns used in this section.
cols = ["id", "name", "city", "state", "bathrooms", "bedrooms", "beds", "square_feet",
"minimum_nights", "maximum_nights", "cancellation_policy", "review_scores_value"]
df = df[cols]

# Mean review score per (state, city). The "mean" aggregation skips missing
# values, like the np.nanmean it replaces; passing NumPy callables to agg is
# deprecated in recent pandas releases.
df_state_city_review = df.groupby(["state", "city"]).agg({"review_scores_value": "mean"})
print(df_state_city_review)

def fill_reviews(row, city_means=None):
    """Fill a missing review score with the mean score of the row's city.

    Parameters
    ----------
    row : pandas.Series
        One listing; must provide "state", "city" and "review_scores_value".
    city_means : pandas.DataFrame, optional
        Frame indexed by (state, city) with a "review_scores_value" column.
        Defaults to the module-level ``df_state_city_review``.

    Returns
    -------
    pandas.Series
        ``row`` itself when the score is present or no mean is known for its
        (state, city); otherwise a copy with the score filled in.
    """
    if city_means is None:
        city_means = df_state_city_review
    if not pd.isna(row["review_scores_value"]):
        return row
    try:
        # A tuple key addresses exactly one (state, city) entry of the
        # MultiIndex; a list would be read as several first-level labels.
        city_mean = city_means.loc[(row["state"], row["city"]), "review_scores_value"]
    except KeyError:
        # No mean available (e.g. missing state/city): leave the score empty.
        return row
    # Work on a copy: the function is meant for DataFrame.apply, and mutating
    # the object handed in by apply is discouraged by pandas.
    filled = row.copy()
    filled["review_scores_value"] = city_mean
    return filled


# axis=1 hands fill_reviews one listing (row) at a time; the default axis=0
# would pass whole columns, which carry no "review_scores_value" label.
df = df.apply(fill_reviews, axis=1)

## SCALES

* Ratio scale: units are equally spaced and all mathematical operations are valid, e.g. height, weight
* Interval scale: units are equally spaced but there is no true zero: the value zero does not mean the absence of the measured quantity,
it is a meaningful value itself, e.g. temperature in degrees Celsius
* Ordinal scale: the order matters but the values are not equally spaced, e.g. letter grades: A+, A, A-...
* Nominal scale: categories that have no order with respect to one another

The different scales are of major importance. Pandas provides dedicated functionality for working with them.

In [None]:
# Load the university ranking dataset used in the rest of this section.
uni_ds = pd.read_csv(filepath_or_buffer="utility_files/cwurData.csv")

def level_rank(world_rank):
    """Translate a world rank into its tier label.

    Ranks up to 100 are "first tier", up to 200 "second tier", up to 300
    "third tier"; everything else is "other top unis".
    """
    tier_limits = ((100, "first tier"), (200, "second tier"), (300, "third tier"))
    for upper_bound, tier in tier_limits:
        if world_rank <= upper_bound:
            return tier
    return "other top unis"



In [None]:
# Label every university with the tier of its world rank.
uni_ds["Rank_level"] = uni_ds["world_rank"].apply(level_rank)

# print(uni_ds.head())

# Mean / min / max score per country (rows) and tier (columns). The
# aggregations are named by string: passing NumPy callables is deprecated in
# recent pandas releases, and the strings fix the top-level column labels
# ("mean", "min", "max") that are used for the lookups below.
score_per_country_per_tier = uni_ds.pivot_table(
    values="score",
    index="country",
    columns="Rank_level",
    aggfunc=["mean", "min", "max"],
)
# print(uni_ds["score"])
# print(uni_ds.loc[:, ["institution", "score", "country"]].iloc[:30,:])
# print(uni_ds["score"][1:20])
# print(score_per_country_per_tier.tail())
first_tier_mean = score_per_country_per_tier["mean"]["first tier"]
print(first_tier_mean)
# idxmax returns the label (country) of the highest first-tier mean and skips
# missing entries, so the row is looked up by label rather than by position.
print(score_per_country_per_tier.loc[first_tier_mean.idxmax()])
