In [2]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

## Preprocessing Data with Min-Max Scaling

In [9]:
df = pd.read_csv("master.csv")
df['GDP'] = df['gdp_for_year ($)'].str.replace(",", "")
df['GDP'] = df['GDP'].astype(int)
df = df.drop("HDI for year", 1)
df = df.drop("gdp_for_year ($)", 1)
df['GDP_per_capita'] = df['gdp_per_capita ($)']
df = df.drop("gdp_per_capita ($)", 1)
data_df = df[['year','sex','age','GDP', 'GDP_per_capita', 'generation']]

In [10]:
min_max_scaler = preprocessing.MinMaxScaler()
X = data_df
Y = df['suicides/100k pop']
X = min_max_scaler.fit_transform(X)

In [13]:
print(X)

[[0.  0.2 0.6]
 [0.  0.6 0.2]
 [1.  0.2 0.6]
 ...
 [0.  0.  1. ]
 [1.  0.  1. ]
 [1.  0.8 0.4]]


## Gradient Boosting  Regression Tree

In [13]:
LRmodel = LinearRegression()
GBmodel = GradientBoostingRegressor()
LRscore = cross_val_score(LRmodel, X, Y, cv=5, scoring='r2').mean()
GBscore = cross_val_score(GBmodel, X, Y, cv=5, scoring='r2').mean()

In [14]:
print(LRscore, GBscore)

0.2488158890689375 0.3118119142327881


In [23]:
parameters = {
    'learning_rate': [0.1, 0.3, 0.6, 0.9],
    'n_estimators': [50, 100, 150],
    'max_depth': [3,4,5],
}

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size= 0.8)
gs_model = GridSearchCV(GradientBoostingRegressor(), parameters, scoring='r2')
gs_model.fit(X_train, Y_train)
test_score = gs_model.score(X_test, Y_test)

In [25]:
print(test_score)

0.6607802506839299


In [26]:
gs_model.best_score_, gs_model.best_params_

(0.6413954656691987,
 {'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 150})

## Support Vector Machine

In [20]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

In [21]:
SVRmodel = SVR()
SVRscore = cross_val_score(SVRmodel, X, Y, cv=5, scoring='r2').mean()
print(SVRscore)

0.29353593323710403


In [22]:
parameters = {
    'kernel': ['linear', 'poly', 'rbf'],
    'degree': [1, 2, 3],
    'C': [1, 2,3],
    'shrinking': [True, False],
}

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size= 0.8)
gs_model = GridSearchCV(SVR(), parameters, scoring='r2')
gs_model.fit(X_train, Y_train)
test_score = gs_model.score(X_test, Y_test)

In [38]:
train_score = gs_model.score(X_train, Y_train)

In [39]:
print(test_score, train_score)

0.6607802506839299 0.7636672916775298


In [25]:
gs_model.best_score_, gs_model.best_params_

(0.30046370943831285,
 {'C': 3, 'degree': 1, 'kernel': 'rbf', 'shrinking': False})

## KNN

In [27]:
from sklearn.neighbors import KNeighborsRegressor
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size= 0.8)
knn = KNeighborsRegressor()
param_grid = {'n_neighbors':list(range(1, 31)), 'weights': ['uniform', 'distance']}
knn_grid = GridSearchCV(knn, param_grid , cv=10)
knn_grid.fit(X_train, Y_train)
acc_train_knn = knn_grid.score(X_train, Y_train) 
acc_test_knn = knn_grid.score(X_test, Y_test)

In [28]:
print(acc_train_knn, acc_test_knn)

0.4153739608039003 0.39530057773900973


## Random Forest

In [36]:
from sklearn.ensemble import RandomForestRegressor
parameters = {
    'min_samples_split': [2, 3, 4],
    'n_estimators': [50, 100, 150],
    'min_samples_leaf': [1, 2, 3],
}

forest = RandomForestRegressor()
forest_grid = GridSearchCV(forest, parameters , scoring="r2", cv=5)
forest_grid.fit(X_train, Y_train)
acc_train_forest = forest_grid.score(X_train, Y_train)
acc_test_forest = forest_grid.score(X_test, Y_test)
print(acc_train_forest, acc_test_forest)

0.956510245495714 0.7180535591991948


In [37]:
forest_grid.best_score_, forest_grid.best_params_

(0.6563396348903822,
 {'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150})