In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

## Preprocessing Data with Min-Max Scaling

In [2]:
# Load the raw table and derive the feature frame used for modelling.
df = pd.read_csv("master.csv")

# The yearly GDP column is text with comma thousands separators: strip them
# and cast to an explicit 64-bit integer. A bare `int` dtype is platform
# dependent and may be 32-bit, which can overflow on large GDP values.
df['GDP'] = df['gdp_for_year ($)'].str.replace(",", "", regex=False).astype("int64")

# Copy the per-capita column under an identifier-friendly name.
df['GDP_per_capita'] = df['gdp_per_capita ($)']

# Remove the columns that are not used further. `columns=` replaces the
# positional axis argument (`df.drop(label, 1)`), which pandas deprecated in
# 1.3 and removed in 2.0.
df = df.drop(columns=["HDI for year", "gdp_for_year ($)", "gdp_per_capita ($)"])

# Feature subset fed to the scaler and the models.
# NOTE(review): the scaled matrix printed further down shows sex / age /
# generation as evenly spaced numeric levels, so these columns are presumably
# already numerically encoded in master.csv -- confirm, since the scaler
# cannot handle string columns.
data_df = df[['year','sex','age','GDP', 'GDP_per_capita', 'generation']]

In [3]:
# Target: the suicide rate per 100k population, taken unscaled from the frame.
Y = df['suicides/100k pop']

# Features: rescale each selected column with min-max scaling.
# NOTE(review): the scaler is fitted on every row, before the train/test
# split further down, so held-out rows influence the scaling range.
min_max_scaler = preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(data_df)

In [4]:
# Show the scaled feature matrix (NumPy abbreviates the middle rows with "...").
print(X)

[[6.45161290e-02 0.00000000e+00 2.00000000e-01 1.16425365e-04
  4.32193242e-03 6.00000000e-01]
 [6.45161290e-02 0.00000000e+00 6.00000000e-01 1.16425365e-04
  4.32193242e-03 2.00000000e-01]
 [6.45161290e-02 1.00000000e+00 2.00000000e-01 1.16425365e-04
  4.32193242e-03 6.00000000e-01]
 ...
 [9.35483871e-01 0.00000000e+00 0.00000000e+00 3.47780561e-03
  1.63202512e-02 1.00000000e+00]
 [9.35483871e-01 1.00000000e+00 0.00000000e+00 3.47780561e-03
  1.63202512e-02 1.00000000e+00]
 [9.35483871e-01 1.00000000e+00 8.00000000e-01 3.47780561e-03
  1.63202512e-02 4.00000000e-01]]


## Gradient Boosting Regression Tree

In [5]:
# Baseline: a default gradient-boosting regressor scored by mean 5-fold
# cross-validated R^2 on the full data set. The fixed random_state makes the
# reported score repeatable from run to run.
# NOTE(review): an integer `cv` gives unshuffled folds for a regressor, so the
# folds follow the row order of the file -- confirm that is intended.
GBmodel = GradientBoostingRegressor(random_state=42)
GBscore = cross_val_score(GBmodel, X, Y, cv=5, scoring='r2').mean()
print(GBscore)

0.31174067930621796


In [6]:
# Hyper-parameter grid for the gradient-boosting search: 4 x 3 x 3 = 36 combinations.
parameters = dict(
    learning_rate=[0.1, 0.3, 0.6, 0.9],
    n_estimators=[50, 100, 150],
    max_depth=[3, 4, 5],
)

In [7]:
# Hold out 20% of the rows for final evaluation. Fixing random_state makes the
# shuffle, and therefore every score reported below, reproducible on a fresh
# kernel.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.8, random_state=42
)

In [10]:
# Exhaustive 5-fold grid search over `parameters`, selecting by R^2.
# - random_state pins the estimator so repeated runs pick the same winner.
# - n_jobs=-1 spreads the independent fits over all available cores; it does
#   not change the results.
GBT_model = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    parameters,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
GBT_model.fit(X_train, Y_train)

In [14]:
# Score the fitted search object on the training rows and on the held-out
# rows, in that order.
train_score_GBT, test_score_GBT = (
    GBT_model.score(features, target)
    for features, target in ((X_train, Y_train), (X_test, Y_test))
)

In [15]:
# Report the train and held-out scores side by side; a large gap between them
# suggests overfitting.
print(train_score_GBT, test_score_GBT)

0.8619027966484294 0.650426034326294


In [16]:
# Best mean cross-validated score found by the grid search, and the parameter
# combination that achieved it.
GBT_model.best_score_, GBT_model.best_params_

(0.6423416720459013,
 {'learning_rate': 0.6, 'max_depth': 5, 'n_estimators': 150})

## Random Forest

In [17]:
# Hyper-parameter grid for the random-forest search: 3 x 3 x 3 = 27 combinations.
# NOTE(review): this rebinds `parameters`, the same name the gradient-boosting
# grid search cell reads; re-running that cell after this one would search
# this grid instead.
parameters = dict(
    min_samples_split=[2, 3, 4],
    n_estimators=[50, 100, 150],
    min_samples_leaf=[1, 2, 3],
)

In [None]:
# Random forests bootstrap rows for each tree, so an unseeded forest gives
# different scores on every run; pin random_state to make the search
# reproducible.
forest = RandomForestRegressor(random_state=42)

# Exhaustive 5-fold grid search over `parameters`, selecting by R^2.
# n_jobs=-1 runs the independent fits on all available cores without changing
# the results.
forest_model = GridSearchCV(forest, parameters, scoring="r2", cv=5, n_jobs=-1)
forest_model.fit(X_train, Y_train)

In [None]:
# Score the fitted forest search on the training rows and on the held-out
# rows, in that order, then report both.
train_score_forest, test_score_forest = (
    forest_model.score(features, target)
    for features, target in ((X_train, Y_train), (X_test, Y_test))
)
print(train_score_forest, test_score_forest)

In [None]:
# Best mean cross-validated score found by the forest grid search, and the
# parameter combination that achieved it.
forest_model.best_score_, forest_model.best_params_