Import the necessary data libraries

In [None]:
import pandas as pd

Read the data into DataFrames

In [None]:
# Load the two tab-separated tables; paths are relative to the working directory.
participants = pd.read_csv("data/participants.tsv", sep="\t")
groundtruth = pd.read_csv("data/groundtruth.tsv", sep="\t")

Check that user IDs are unique in each table and identical across both tables

In [None]:
# Validate the join key before merging: each table must contain exactly one
# row per user_id, and both tables must cover the same set of users.
participant_user_id = set(participants["user_id"].unique())
groundtruth_user_id = set(groundtruth["user_id"].unique())

assert len(participants) == len(participant_user_id), "Participants are not unique"
# Distinct message so a failure identifies the groundtruth table, not participants.
assert len(groundtruth) == len(groundtruth_user_id), "Groundtruth user_ids are not unique"
assert participant_user_id == groundtruth_user_id, "The data is different"

In [None]:
# Attach the groundtruth columns (minus log_id) to every participant row.
# A left join keeps all participants even when no groundtruth row matches.
df = participants.merge(
    groundtruth.drop(columns="log_id"),
    on="user_id",
    how="left",
)
# Write the full merged table as plain text to report.txt.
df.to_string("report.txt")

In [None]:
# Cast the two label columns to integers in a single pass
# (the original repeated the "attention" cast twice).
df = df.astype({"ad_clicked": "int", "attention": "int"})
df.dtypes

In [None]:
# Gender: the literal string "na" marks a missing answer, so blank it out first.
df["gender"] = df["gender"].where(df["gender"] != "na")

# Fill the missing entries with the most frequent gender (mode ignores missing values).
most_common_gender = df["gender"].mode().iloc[0]
df["gender"] = df["gender"].fillna(most_common_gender)

In [None]:
# Age: the literal string "na" marks a missing answer.
# Convert to numbers first so missing entries become NaN and are excluded from
# the mean. The original filtered with `df["age"] != None`, which is True for
# every row of a Series and therefore filtered nothing; it also cast a column
# on a slice of df, which triggers SettingWithCopyWarning.
age_numeric = pd.to_numeric(df["age"].where(df["age"] != "na"))
age_mean = round(age_numeric.mean(), 1)

# Impute missing ages with the rounded mean and store the column as float.
df["age"] = age_numeric.fillna(age_mean).astype("float")

In [None]:
# Education: the literal string "na" marks a missing answer.
# Convert to numbers before taking the median. The original filtered with
# `df["education"] != None`, which is True for every row and filtered nothing,
# and then called .median() on a column that may still hold strings.
education_numeric = pd.to_numeric(df["education"].where(df["education"] != "na"))
education_median = education_numeric.median()

# Impute with the median, then cast to int
# (a fractional median such as 2.5 is truncated by the integer cast).
df["education"] = education_numeric.fillna(education_median).astype(int)

In [None]:
# Income: the literal string "na" marks a missing answer.
# Convert to numbers before taking the median. The original filtered with
# `df["income"] != None`, which is True for every row and filtered nothing,
# and then called .median() on a column that may still hold strings.
income_numeric = pd.to_numeric(df["income"].where(df["income"] != "na"))
income_median = income_numeric.median()

# Impute with the median, then cast to int
# (a fractional median such as 2.5 is truncated by the integer cast).
df["income"] = income_numeric.fillna(income_median).astype("int")

In [None]:
# Print how many rows have country == "na", then collapse the column to a
# binary USA / non-USA value (rows with "na" therefore become "non-USA").
is_na_country = df["country"].eq("na")
print(int(is_na_country.sum()))
df["country"] = df["country"].where(df["country"].eq("USA"), "non-USA")

In [None]:
# One-hot encode the categorical columns.
# drop_first drops one level per column; dummy_na adds an indicator column for missing values.
one_hot_cols = ["country", "gender", "ad_position", "ad_type", "ad_category"]
dummies = pd.get_dummies(
    df,
    columns=one_hot_cols,
    drop_first=True,
    dummy_na=True,
)


In [None]:
# Features: three numeric columns only (the one-hot columns in `dummies` are not selected).
# Target: income.
feature_cols = ["education", "age", "attention"]
X = dummies[feature_cols]
y = dummies.loc[:, "income"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split and predict the held-out split.
model_linear = LinearRegression().fit(X_train, y_train)
y_predict = model_linear.predict(X_test)

# Show the fitted intercept and the per-feature coefficients.
print("Intercept:", model_linear.intercept_, " Coefficient:", model_linear.coef_)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Score the current y_predict against the held-out targets.
mse = mean_squared_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)

# Bare tuple as the last expression so the notebook displays it.
("MSE", round(mse, 3), "R2 score", round(r2, 3))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier (100 trees, fixed seed) trained on the same split;
# each distinct income value is treated as a class label.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Rebinds y_predict to the random forest's predictions.
y_predict = model_rf.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, precision_score, accuracy_score, recall_score,f1_score

# Score the current y_predict against the held-out targets, rounding each metric to 3 decimals.
MSE = round(mean_squared_error(y_test, y_predict), 3)
r2 = round(r2_score(y_test, y_predict), 3)
CA = round(accuracy_score(y_test, y_predict), 3)

# Precision was previously hard-coded to 0, so the report printed a false value.
# zero_division=0 scores a class that is never predicted as 0 instead of
# emitting an undefined-metric warning.
Precision = round(
    precision_score(y_test, y_predict, average="weighted", zero_division=0),
    3,
)
Recall = round(recall_score(y_test, y_predict, average="weighted"), 3)
F1 = round(f1_score(y_test, y_predict, average="weighted"), 3)

print("MSE:", MSE," R-squared:", r2, " Accuracy:", CA, " Precision:", Precision, " Recall:", Recall, " F-measure:", F1)