# Home Assignment
## Alireza Bolhassani
----
At each stage, the required libraries will be imported.

In [None]:
import pandas as pd

-----
Load the participants data and the groundtruth data into separate data frames.  
Both files are tab-separated (TSV), so the tab character ("\t") must be passed to `read_csv` as the field separator.

In [None]:
# Both inputs are tab-separated files; `sep` is the canonical name of the `delimiter` option.
participants = pd.read_csv("data/participants.tsv", sep="\t")
groundtruth = pd.read_csv("data/groundtruth.tsv", sep="\t")

-----
Since user_id values must be unique, we check that the number of unique user_id values equals the number of rows in each data frame. Both data frames must also contain exactly the same set of user_id values.

In [None]:
# Collect the distinct user_id values of each table as sets so they can be compared directly.
participant_user_id = set(participants["user_id"].unique())
groundtruth_user_id = set(groundtruth["user_id"].unique())

# A row count equal to the number of distinct ids means no user_id is repeated in that table.
assert len(participants) == len(participant_user_id), "Participants are not unique"
# Use a message that names the groundtruth table so a failure points at the right input.
assert len(groundtruth) == len(groundtruth_user_id), "Groundtruth user_ids are not unique"
# Both tables must describe exactly the same set of users.
assert participant_user_id == groundtruth_user_id, "The data is different"

------
#### Merge Data frames
Merge the two data frames with the pandas "merge" function, using the common identifier column "user_id" that exists in both data frames. The "log_id" column of the groundtruth data is dropped first, and the result is a single new data frame.

In [None]:
# Remove the groundtruth log_id column, then attach the remaining groundtruth
# columns to every participant row by matching on user_id (left join).
groundtruth_features = groundtruth.drop(columns="log_id")
df = participants.merge(groundtruth_features, how="left", on="user_id")
df.shape

-----


In [None]:
# Cast the two groundtruth columns to integers; each column is cast exactly once.
df["ad_clicked"] = df["ad_clicked"].astype("int")
df["attention"] = df["attention"].astype("int")
# Display the resulting column dtypes as the cell output.
df.dtypes

-----------
#### Gender Preparation
Impute missing gender values with the most common gender

In [None]:
# Treat the "na" placeholder as a missing value.
gender_known = df["gender"].mask(df["gender"] == "na")
# mode() ignores missing values; its first entry is the most frequent gender.
most_common_gender = gender_known.mode()[0]
# Fill every missing gender with that most frequent value.
df["gender"] = gender_known.fillna(most_common_gender)

-----------
#### Age Value Handle

In [None]:
# Age handling: turn the "na" placeholder into a real missing value and make the column numeric.
# (An element-wise `!= None` comparison is True for every row, so it cannot be used to filter.)
df["age"] = df["age"].mask(df["age"] == "na").astype("float")

# Rows whose age is actually known.
age_filter_na = df.loc[df["age"].notna()]
# Mean of the known ages, rounded to one decimal place.
age_mean = round(age_filter_na["age"].mean(), 1)

# Replace the missing ages with the mean; the column stays float.
df["age"] = df["age"].fillna(age_mean)

------
#### Education Handle Value

In [None]:
# Turn the "na" placeholder into a real missing value and make the column numeric,
# so the median is computed on numbers rather than on an object column.
# (An element-wise `!= None` comparison is True for every row, so it cannot be used to filter.)
df["education"] = df["education"].mask(df["education"] == "na").astype("float")

# Rows whose education value is actually known.
education_filter_na = df.loc[df["education"].notna()]
education_median = education_filter_na["education"].median()

# Fill the gaps with the median, then store the column as integers
# (a fractional median is truncated by the integer cast).
df["education"] = df["education"].fillna(education_median)
df["education"] = df["education"].astype(int)

-------
#### Income Handle Value

In [None]:
# Turn the "na" placeholder into a real missing value and make the column numeric,
# so the median is computed on numbers rather than on an object column.
# (An element-wise `!= None` comparison is True for every row, so it cannot be used to filter.)
df["income"] = df["income"].mask(df["income"] == "na").astype("float")

# Rows whose income value is actually known.
income_filter_na = df.loc[df["income"].notna()]
income_median = income_filter_na["income"].median()

# Fill the gaps with the median, then store the column as integers
# (a fractional median is truncated by the integer cast).
df["income"] = df["income"].fillna(income_median)
df["income"] = df["income"].astype("int")

------
#### Apply the map to the "country" column

In [None]:
# Report how many rows carry the "na" placeholder as their country.
country_na_count = df.loc[df["country"].eq("na")]
print(len(country_na_count))
# Keep "USA" as is and collapse every other value (including "na") into "non-USA".
df["country"] = df["country"].where(df["country"] == "USA", "non-USA")

In [None]:
# Categorical columns that are expanded into indicator columns.
one_hot_cols = ["country", "gender", "ad_position", "ad_type", "ad_category"]

# dummy_na=True adds an indicator column for missing values;
# drop_first=True omits the first level of every encoded column.
dummies = pd.get_dummies(
    data=df,
    columns=one_hot_cols,
    drop_first=True,
    dummy_na=True,
)

In [None]:
# Predictors used by the models below; the target is the income column.
feature_columns = ["education", "age", "attention"]
X = dummies[feature_columns]
y = dummies["income"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

-------
-------
## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression with an intercept term, fitted on the training split
# (fit() returns the estimator itself, so the calls can be chained).
model_linear = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# Predictions for the held-out rows.
y_predict = model_linear.predict(X_test)

# Report the fitted intercept and the coefficients of the feature columns.
print("Intercept:", model_linear.intercept_, " Coefficient:", model_linear.coef_)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Score the current y_predict against the held-out targets.
r2 = r2_score(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)

# The tuple is the cell's displayed output.
("MSE", round(mse, 3), "R2 score", round(r2, 3))

--------
--------
## Random Forest
Train the Random Forest model using the training set. You can specify the number of trees in the forest and other hyperparameters such as the maximum depth of the trees.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest classifier: 100 trees, each limited to depth 4, random_state fixed at 42.
# fit() returns the estimator itself, so construction and training are chained.
model_rf = RandomForestClassifier(
    random_state=42,
    max_depth=4,
    n_estimators=100,
).fit(X_train, y_train)

# Overwrites y_predict with the forest's predictions for the held-out rows.
y_predict = model_rf.predict(X_test)

# Importance of each feature column, shown as the cell output.
model_rf.feature_importances_

-------
Evaluate the performance of the model on the testing set using metrics such as accuracy, precision, recall, and F1 score. 

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, precision_score, accuracy_score, recall_score,f1_score

# Compute metrics on the held-out split using the current y_predict.
MSE = mean_squared_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)
CA = accuracy_score(y_test, y_predict)
# Compute the real weighted precision instead of a hard-coded 0.
# zero_division=0 scores classes that are never predicted as 0 without emitting a warning.
Percision = precision_score(y_test, y_predict, average="weighted", zero_division=0)
Recall = recall_score(y_test, y_predict, average="weighted")
F1 = f1_score(y_test, y_predict, average="weighted")

print("MSE:", round(MSE,3)," R-squared:", round(r2,3), " Accuracy:", round(CA,3))
print("Precision:", round(Percision,3), " Recall:", round(Recall,3), " F-measure:", round(F1,3))

----------
Visualize the structure of an individual decision tree in the trained Random Forest model.


In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

tree = model_rf.estimators_[0]   # extract the first decision tree in the forest
plt.figure(figsize=(20,7))   # set the size of the figure
# Pass the feature names as a plain list: some scikit-learn releases validate
# `feature_names` as a list and reject a pandas Index.
plot_tree(tree, filled=True, feature_names=list(X.columns))   # plot the tree and fill the nodes with colors
plt.show()   # show the plot
