In [None]:
import gc
import warnings
from random import seed

import matplotlib.pyplot as plt
import missingno as ms
import pandas as pd
import seaborn as sns
import statsmodels as sm

RANDOM_SEED = 42

%matplotlib inline

seed(42)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("float_format", "{:f}".format)
warnings.filterwarnings("ignore")

In [None]:
# Read the competition CSVs. The "ID" column is removed from the train and
# test frames; the sample submission is loaded unchanged because its "ID"
# column is reused when the submission files are written.
train = pd.read_csv("Train (1).csv").drop(columns="ID")
test = pd.read_csv("Test.csv").drop(columns="ID")
ss = pd.read_csv("SampleSubmission.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the sample submission (its ID column is reused for the output files).
ss.head()

In [None]:
# (rows, columns) of the train and test frames.
train.shape, test.shape

In [None]:
# missingno matrix plot of the training frame.
ms.matrix(train)

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
plt.figure(figsize=(19, 8))
sns.heatmap(train.corr(), annot=True)

In [None]:
sns.histplot(train.target)

In [None]:
train.corr()["target"]

In [None]:
train.agefrom.value_counts().plot(kind="bar")

In [None]:
train.ageto.value_counts().plot(kind="bar")

In [None]:
train.sex.value_counts().plot(kind="bar")

In [None]:
# Summary statistics (one row per column) for the contiguous block of
# schooling-percentage columns, selected by label slice.
schooling_stats = train.loc[
    :, "percentage_primary_schooling":"percentage_secondary_schooling_completed"
].describe().T

# Styled view: in-cell bars for the mean, colour gradients for std and median.
(
    schooling_stats.style
    .bar(subset=["mean"], color="#206ff2")
    .background_gradient(subset=["std"], cmap="Reds")
    .background_gradient(subset=["50%"], cmap="coolwarm")
)

In [None]:
# Compare the train and test distribution of each feature in columns 3..11
# of the training frame, one KDE panel per feature.
features = train.columns.values[3:12]
sns.set_style("whitegrid")

# Build exactly one grid sized to the number of features. The previous code
# opened an empty figure, then a 10x10 grid of 100 axes, and then drew over
# it with plt.subplot(5, 2, i).
n_cols = 2
n_rows = -(-len(features) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 22), squeeze=False)
flat_axes = axes.ravel()

for ax, feature in zip(flat_axes, features):
    # kdeplot replaces the deprecated distplot(hist=False).
    sns.kdeplot(x=train[feature], label="train", ax=ax)
    sns.kdeplot(x=test[feature], label="test", ax=ax)
    ax.set_xlabel(feature, fontsize=9)
    ax.tick_params(axis="x", which="major", labelsize=6, pad=-6)
    ax.tick_params(axis="y", which="major", labelsize=6)
    # Without a legend the two curves cannot be told apart.
    ax.legend()

# Hide any panel left over when the feature count does not fill the grid.
for ax in flat_axes[len(features):]:
    ax.set_visible(False)

plt.show();

In [None]:
# encoding features
train = pd.get_dummies(train, drop_first=True)

test = pd.get_dummies(test, drop_first=True)

In [None]:
train.shape, test.shape

In [None]:
train.corr()["target"].sort_values()

 From the above cell we can see that many features are either uncorrelated or negatively correlated with the target.<br>
 I wanted to remove ageto because it doesn't carry any information, but dropping it reduced the score.

In [None]:
# taking both columns names for train and test
train_col = train.columns.to_list()
test_col = test.columns.to_list()

In [None]:
X = train.drop(["target"], 1).values
y = train.target.values

In [None]:
X.shape, test.shape

In [None]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# Exploratory OLS fit on the full training matrix.
# statsmodels' OLS does not add an intercept by itself, so a constant column
# is prepended; otherwise the fit is forced through the origin.
X_ols = sm.add_constant(X)

mod = sm.OLS(y, X_ols)  # Describe model

res = mod.fit()  # Fit model

print(res.summary())  # Summarize model

In [None]:
# Hold out 5% of the rows for validation; the fixed random_state keeps the
# split identical across runs.
holdout_split = train_test_split(X, y, random_state=42, test_size=0.05)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
# import regression algorithm
from sklearn.ensemble import (
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Candidate regressors compared by 5-fold cross-validated MAE.
# Stochastic estimators receive the shared seed so the ranking is reproducible.
models = {
    # "SVM": SVR(),
    "KNN": KNeighborsRegressor(),
    "RF": RandomForestRegressor(random_state=RANDOM_SEED),
    "GB": GradientBoostingRegressor(random_state=RANDOM_SEED),
    "DTC": DecisionTreeRegressor(random_state=RANDOM_SEED),
    "HGB": HistGradientBoostingRegressor(random_state=RANDOM_SEED),
    "LR": LinearRegression(),
}


for model_name, model in models.items():
    print("Train {}".format(model_name))
    # cross_val_score clones and fits the estimator on every fold, so fitting
    # it on the whole training split beforehand was wasted work.
    scores = cross_val_score(
        model, X_train, y_train, cv=5, scoring="neg_mean_absolute_error"
    )
    # Scores are negated MAE: values closer to zero are better.
    print(scores)
    scores_average = scores.mean()
    print(scores_average)
    print("-------------The End------------------")

### The cross-validation shows that Random Forest is the best among all the models here, so we are going to use that one for now.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
rf = RandomForestRegressor(n_jobs=-1)
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
mean_absolute_error(y_test, y_pred)

In [None]:
pred = rf.predict(test)

In [None]:
pred.shape, ss.shape

In [None]:
col_id = ss.ID

In [None]:
sub = pd.DataFrame({"ID": col_id, "target": pred})
sub.to_csv("rf.csv", index=False)

### Trying other models, although Random Forest gives the best score so far

## Lightgbm

In [None]:
from lightgbm import LGBMRegressor

In [None]:
lgb = LGBMRegressor()
lgb.fit(X_train, y_train)

In [None]:
y_pred_lgb = lgb.predict(X_test)
score = mean_absolute_error(y_test, y_pred_lgb)
score

In [None]:
y_pred_lgb = lgb.predict(test)
sub = pd.DataFrame({"ID": ss.ID, "target": y_pred_lgb})
sub.to_csv("lgb.csv", index=False)

In [None]:
# Feature subset used for the remaining models: the six schooling-percentage
# columns plus `yr_sch`.
# NOTE(review): the variable name is misspelled ("selcted"), but later cells
# reference it under this exact name, so it is left unchanged here.
selcted_col = [
    "percentage_tertiary_schooling_completed",
    "percentage_tertiary_schooling",
    "percentage_primary_schooling",
    "percentage_primary_schooling_completed",
    "percentage_secondary_schooling_completed",
    "percentage_secondary_schooling",
    "yr_sch",
]

##  Selecting features based on the correlation

In [None]:
X_df = train[selcted_col]
X_df.head()

In [None]:
test = test[selcted_col]

In [None]:
plt.figure(figsize=(15, 8))
sns.heatmap(X_df.corr(), annot=True)

In [None]:
# X_df.corr()["target"].sort_values()

In [None]:
X = X_df
y = train.target

In [None]:
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values, random_state=123, test_size=0.1
)


# kfold = KFold(n_splits=10)

In [None]:
cb = CatBoostRegressor(verbose=0)
cb.fit(X_train, y_train)
pred = cb.predict(X_test)
pred[:10]

In [None]:
# Feature importances reported by the fitted CatBoost model, paired with the
# names of the selected feature columns.
feature_importance_df = pd.DataFrame(cb.feature_importances_, columns=["importance"])
feature_importance_df["feature"] = X.columns

plt.figure(figsize=(17, 12))
sns.barplot(
    x="importance",
    y="feature",
    data=feature_importance_df.sort_values(by=["importance"], ascending=False).head(40),
)
# The plotted model is CatBoost; the title previously named XGBoost.
plt.title("CatBoostRegressor features importance");

In [None]:
mean_absolute_error(y_test, pred)

In [None]:
pred_cb = cb.predict(test)

In [None]:
sub = pd.DataFrame({"ID": ss.ID, "target": pred_cb})
sub.to_csv("cb.csv", index=False)

In [None]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
pred_rf = rfr.predict(X_test)
mean_absolute_error(y_test, pred_rf)

In [None]:
pred_rfr = rfr.predict(test)

In [None]:
sub = pd.DataFrame({"ID": col_id, "target": pred_rfr})
sub.to_csv("rfr_added.csv", index=False)

In [None]:
# Feature importances reported by the fitted random forest, paired with the
# names of the selected feature columns.
feature_importance_df = pd.DataFrame(rfr.feature_importances_, columns=["importance"])
feature_importance_df["feature"] = X.columns

plt.figure(figsize=(17, 12))
sns.barplot(
    x="importance",
    y="feature",
    data=feature_importance_df.sort_values(by=["importance"], ascending=False).head(40),
)
# The plotted model is the random forest; the title previously named XGBoost.
plt.title("RandomForestRegressor features importance");

In [None]:
gb = GradientBoostingRegressor(
    n_estimators=500,
    max_depth=8,
)
gb.fit(X_train, y_train)

pred_gb = gb.predict(X_test)

In [None]:
mean_absolute_error(y_test, pred_gb)