In [None]:
import random
import warnings
from random import seed

import matplotlib.pyplot as plt
import missingno as ms
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels as sm

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)

%matplotlib inline

seed(42)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("float_format", "{:f}".format)
warnings.filterwarnings("ignore")

In [None]:
# Read the three competition files. The "ID" column is removed from the
# train and test frames; the sample submission is kept untouched.
train_raw = pd.read_csv("Train (1).csv")
test_raw = pd.read_csv("Test.csv")

train = train_raw.drop(columns="ID")
test = test_raw.drop(columns="ID")
ss = pd.read_csv("SampleSubmission.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the first rows of the sample submission file.
ss.head()

In [None]:
# (rows, columns) of the train and test frames.
train.shape, test.shape

In [None]:
# missingno matrix plot of the training frame, used to inspect missing values.
ms.matrix(train)

In [None]:
# Summary of the training frame's columns (dtypes and non-null counts).
train.info()

In [None]:
# Correlation heatmap of the training frame.
# Only numeric/bool columns are correlated: at this point the frame has not
# been one-hot encoded yet, and recent pandas versions raise on `corr()` when
# non-numeric columns are present instead of silently skipping them.
fig, ax = plt.subplots(figsize=(19, 8))
numeric_corr = train.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(numeric_corr, annot=True, ax=ax)

From the heatmap above we can see that some columns are not correlated at all, for example **ageto** and **agefrom**, but we are not going to drop them yet because they can still carry a signal. You can think of it like missing values (**NaN**), which can sometimes be a signal in themselves and can be treated as a column for the model to learn from.
Later on I will drop them to see whether it changes anything.

In [None]:
# Histogram of the regression target.
sns.histplot(train["target"])

In [None]:
# Correlation of every numeric/bool column with the target.
# Non-numeric columns are excluded explicitly because recent pandas versions
# raise on `corr()` rather than dropping them silently.
train.select_dtypes(include=["number", "bool"]).corr()["target"]

In [None]:
# Bar chart of how often each `agefrom` value occurs.
train["agefrom"].value_counts().plot.bar()

In [None]:
# Bar chart of how often each `ageto` value occurs.
train["ageto"].value_counts().plot.bar()

In [None]:
# Bar chart of how often each `sex` value occurs.
train["sex"].value_counts().plot.bar()

In [None]:
# Descriptive statistics for the schooling-percentage columns, one row per
# column, styled so the mean, std and median are easy to compare.
schooling_summary = (
    train.loc[
        :, "percentage_primary_schooling":"percentage_secondary_schooling_completed"
    ]
    .describe()
    .T
)

schooling_styled = schooling_summary.style.bar(subset=["mean"], color="#206ff2")
schooling_styled = schooling_styled.background_gradient(subset=["std"], cmap="Reds")
schooling_styled.background_gradient(subset=["50%"], cmap="coolwarm")

In [None]:
# Compare the train and test distributions of the features stored in column
# positions 3..11 of the training frame, one density plot per feature.
features = train.columns.values[3:12]
sns.set_style("whitegrid")

# Build exactly the grid that is needed (two plots per row) instead of
# allocating a large grid and overwriting it with `plt.subplot`.
n_cols = 2
n_rows = max(1, int(np.ceil(len(features) / n_cols)))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 22))
axes = np.atleast_1d(axes).ravel()

for ax, feature in zip(axes, features):
    # `kdeplot` replaces the deprecated `distplot(..., hist=False)`.
    sns.kdeplot(train[feature], ax=ax, label="train")
    sns.kdeplot(test[feature], ax=ax, label="test")
    ax.set_xlabel(feature, fontsize=9)
    ax.tick_params(axis="x", which="major", labelsize=6, pad=-6)
    ax.tick_params(axis="y", which="major", labelsize=6)
    ax.legend(fontsize=6)

# Hide grid cells that have no feature assigned to them.
for ax in axes[len(features):]:
    ax.set_visible(False)

plt.show();

In [None]:
# One-hot encode the categorical features.
# The dummy dtype is pinned so the result does not depend on the pandas
# version (the default changed from uint8 to bool in pandas 2.0).
train = pd.get_dummies(train, drop_first=True, dtype=np.uint8)

# The test frame is encoded with *all* of its categories and then aligned to
# the training feature columns. Encoding both frames independently with
# `drop_first=True` can drop a different baseline category (or yield different
# columns) when the two frames do not contain the same category values.
# Categories that only occur in the test frame are dropped, i.e. such rows are
# encoded as the baseline category.
feature_cols = train.columns.drop("target")
test = pd.get_dummies(test, dtype=np.uint8).reindex(
    columns=feature_cols, fill_value=0
)

In [None]:
# (rows, columns) of the train and test frames after the encoding step.
train.shape, test.shape

In [None]:
# Correlation of each column with the target, sorted in ascending order.
corr_with_target = train.corr()["target"]
corr_with_target.sort_values()

 From the cell above we can see that many features have little or no correlation with the target, and some are negatively correlated.<br>
 I wanted to remove **ageto** because it does not seem to carry any information, but dropping it reduced the score.

In [None]:
# Keep the column names of both frames as plain lists; they are intended for
# the feature-importance analysis later on.
train_col = list(train.columns)
test_col = list(test.columns)

In [None]:
# Feature matrix and target vector as NumPy arrays.
# `columns=` replaces the positional `axis` argument, which pandas 2.0 no
# longer accepts. The matrix is cast to float so that mixed numeric/bool
# columns do not produce an object array.
X = train.drop(columns=["target"]).to_numpy(dtype=float)
y = train["target"].to_numpy()

In [None]:
# Shape of the feature matrix next to the shape of the test frame.
X.shape, test.shape

In [None]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# Ordinary least squares fit of the target on all features, used only for its
# summary table.
# statsmodels does not add an intercept on its own, so a constant column is
# added explicitly; without it the regression is forced through the origin.
X_ols = sm.add_constant(np.asarray(X, dtype=float))

mod = sm.OLS(y, X_ols)  # Describe model

res = mod.fit()  # Fit model

print(res.summary())  # Summarize model

In [None]:
# Hold out 5% of the rows for a final check; the split is seeded with
# `seed_val`.
holdout_parts = train_test_split(X, y, random_state=seed_val, test_size=0.05)
X_train, X_test, y_train, y_test = holdout_parts

In [None]:
# import regression algorithm
from sklearn.ensemble import (
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [None]:
%%time
models = {
    # "SVM": SVR(),
    "KNN": KNeighborsRegressor(),
    "RF": RandomForestRegressor(),
    "GB": GradientBoostingRegressor(),
    "DTC": DecisionTreeRegressor(),
    "HGB": HistGradientBoostingRegressor(),
    "LR": LinearRegression(),
}


for model_name, model in models.items():
    print("Train {}".format(model_name))
    model.fit(X_train, y_train)
    scores = cross_val_score(
        model, X_train, y_train, cv=10, scoring=("neg_mean_absolute_error")
    )
    print(scores)
    scores_average = (sum(scores)) / (len(scores))
    print(scores_average)

    #     print("List of scores for {}: {}".format(model_name, scores))
    #     print("The mean score for {}: {}".format(model_name, scores.mean()))
    print("-------------The End------------------")

### The cross-validation shows that Random Forest is the best of the models tried here, so we are going to use that one for now.

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Random forest fitted on the training split, using all CPU cores.
# The explicit seed makes the fitted model independent of how much of the
# global NumPy random state earlier cells have consumed.
rf = RandomForestRegressor(n_jobs=-1, random_state=seed_val)
rf.fit(X_train, y_train)

In [None]:
# Predictions of the fitted forest for the held-out split.
y_pred = rf.predict(X_test)

In [None]:
# Mean absolute error on the held-out split.
mean_absolute_error(y_test, y_pred)

In [None]:
# Predict the competition test set.
# The forest was fitted on a plain NumPy matrix, so features are matched by
# position only. The test frame is therefore put into exactly the training
# feature order (dummy columns missing from the test frame are filled with 0)
# and converted to the same array form before predicting.
predict_cols = [col for col in train_col if col != "target"]
test_matrix = test.reindex(columns=predict_cols, fill_value=0).to_numpy(dtype=float)
pred = rf.predict(test_matrix)

In [None]:
# Shape of the prediction array next to the shape of the sample submission.
pred.shape, ss.shape

In [None]:
# IDs for the submission file, taken from the sample submission.
col_id = ss["ID"]

In [None]:
# Assemble the submission frame (ID + predicted target) and write it to disk
# without the index column.
sub = pd.DataFrame({"ID": col_id}).assign(target=pred)
sub.to_csv("rf.csv", index=False)

# Using k-fold cross-validation

In [None]:
from sklearn.model_selection import KFold

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, random_state=123, test_size=0.1
# )

In [None]:
%%time
kf = KFold(5, shuffle=True, random_state=42)

oos_y = []
oos_pred = []
fold = 0
for train, val in kf.split(X, y):
    fold += 1

    X_train = X[train]
    y_train = y[train]
    X_val = X[val]
    y_val = y[val]
    model = RandomForestRegressor()
    model.fit(X_train, y_train)

    pred = model.predict(X_val)
    oos_y.append(y_val)
    oos_pred.append(pred)
    score = mean_absolute_error(y_val, pred)
    print(f"Fold {fold} score(MAE): {score}")

I have tried different models with different preprocessing and feature engineering techniques, but among them all Random Forest gives the best score.