In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
# Silence every warning to keep the notebook output compact.
# NOTE(review): this blanket filter also hides deprecation and scoring
# warnings raised by the libraries used below -- consider narrowing it.
warnings.filterwarnings("ignore")
# Apply the 'ggplot' matplotlib style to the figures drawn afterwards.
plt.style.use('ggplot')
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# NOTE: data_prediction.csv is a subset of the original file and is sufficient
# for testing; point read_csv at it to try the notebook quickly.
data = pd.read_csv('data.csv').drop(columns=['Unnamed: 0'])

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Print the column dtypes and non-null counts.
data.info()

In [None]:
# Store Year as text so the dtype-based split below treats it as categorical.
data['Year'] = data['Year'].astype(str)

In [None]:
# List the distinct values of 'Vict Descent' (used as the target further down).
data['Vict Descent'].unique()

In [None]:
# Split the data into numeric (data_num) and categorical (data_cat) frames.
# Object-dtype columns count as categorical; everything else as numeric.
# .copy() makes both frames independent of `data`, so the column assignments
# performed on data_cat afterwards never write into a selection of `data`
# (which would otherwise be chained assignment on a slice).

num_cols = [x for x in data.columns if data[x].dtype != 'O']
cat_cols = [x for x in data.columns if data[x].dtype == 'O']

data_num = data[num_cols].copy()
data_cat = data[cat_cols].copy()

In [None]:
# Encode the 'Vict Descent' labels as integers.

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(data_cat['Vict Descent'])

# One integer code per row, in the original row order.
vict_descent = label_encoder.transform(data_cat['Vict Descent'])

In [None]:
# Swap the text labels for their integer codes, then list the distinct codes.
data_cat = data_cat.assign(**{'Vict Descent': vict_descent})

data_cat['Vict Descent'].unique()

In [None]:
# Feature engineering: mean-encode every categorical column of data_cat.
# Each category is replaced by the mean 'Vict Descent' code of its rows.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split further down, so the target leaks into the features.

for col in [c for c in data_cat.columns if c != 'Vict Descent']:
    category_means = data_cat.groupby(col)['Vict Descent'].mean()
    data_cat[col] = data_cat[col].map(category_means)

# Re-join with the numeric columns and discard three of them.
df = pd.concat([data_cat, data_num], axis=1).drop(
    columns=['Population', 'Crime Cnt', 'Crime Rate']
)

In [None]:
# Correlation of every column with the target, largest value first.
target_corr = df.corr()['Vict Descent']
target_corr.sort_values(ascending=False)

In [None]:
# Annotated heatmap of the full correlation matrix.
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(df.corr(), annot=True, ax=ax)

In [None]:
# Select the best columns to use for the analysis.
# Option 1: SelectFromModel wrapped around a Lasso estimator.

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Features are every column except the target.
X = df.drop(columns='Vict Descent')
y = df['Vict Descent']

model = SelectFromModel(Lasso(alpha=0.005)).fit(X, y)

# Boolean mask: True for each column the selector keeps.
model.get_support()

In [None]:
# Names of the columns kept by the Lasso-based selector, then their values.
selected_feature = X.columns[model.get_support()]
X.loc[:, selected_feature]

In [None]:
# Option 2: mutual information between each feature and the target.
# 'Vict Descent' holds label-encoded classes, so the classification estimator
# is used (the regression variant treats the target as continuous).
# random_state fixes the estimator's randomness so the scores are repeatable.

from sklearn.feature_selection import mutual_info_classif

imp = mutual_info_classif(X, y, random_state=42)

In [None]:
# Rank the features by mutual information, highest first.
pd.Series(imp, index=X.columns).sort_values(ascending=False).to_frame()

In [None]:
# Option 3: chi-squared score for every feature except LON.
# NOTE(review): LON is presumably left out because chi2 rejects negative
# values -- confirm.

from sklearn.feature_selection import SelectKBest, chi2

# k='all' keeps every feature; only the scores are of interest here.
orderd_rank_features = SelectKBest(score_func=chi2, k='all')

chi2_inputs = X.drop(columns=['LON'])
ordered_feature = orderd_rank_features.fit(chi2_inputs, y)

In [None]:
# Raw chi-squared score of each feature, in the column order passed to fit().
ordered_feature.scores_

In [None]:
# Pair each feature name (column 0) with its chi-squared score, best first.
ordered_df = pd.DataFrame(
    {
        0: X.drop(columns='LON').columns,
        'score': ordered_feature.scores_,
    }
).sort_values('score', ascending=False)

# Keep the names of the eight best-scoring features.
selected_features = ordered_df.head(8)[0].to_numpy()

selected_features

In [None]:
# Restrict the feature matrix to the chi-squared selection.
# NOTE: this rebinds X, so the selection cells above need X rebuilt from df
# before they can be re-run.
X = X.loc[:, selected_features]

In [None]:
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
# Display cross-validation scores

def display_scores(scores):
    """Print the scores, their mean and their standard deviation.

    Every line of the report is framed by a separator line.

    Parameters
    ----------
    scores : array-like exposing ``mean()`` and ``std()``
        Scores to summarise, e.g. the array returned by ``cross_val_score``.

    Returns
    -------
    None
    """
    rule = '==============================================='
    # Values are wrapped in callables so each one is only computed right
    # before its own line is printed.
    report = (
        ('Scores: {}', lambda: scores),
        ('Mean Score: {}', lambda: scores.mean()),
        ('Standard Deviation of Scores: {}', lambda: scores.std()),
    )

    print(rule)
    for template, value in report:
        print(template.format(value()))
        print(rule)

In [None]:
# Create the predict function

def predict(ml_model):
    """Fit a classifier, print its hold-out and cross-validated scores, and plot them.

    Relies on the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test`` variables.

    Parameters
    ----------
    ml_model : estimator
        Object exposing ``fit``, ``predict`` and ``score``; it is (re)fitted
        on the training split.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    model = ml_model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print(f'Predictions: {pred}')
    print(f'Training Score: {model.score(X_train, y_train)}')
    print('\n')
    # scikit-learn metrics take (y_true, y_pred): rows of the matrix are the
    # true classes and columns the predicted classes.
    print(f'{confusion_matrix(y_test, pred)}')
    print('\n')
    print(f'Accuracy Score: {accuracy_score(y_test, pred)}')
    # NOTE(review): MSE over label-encoded classes depends on the arbitrary
    # order of the codes; kept for continuity with earlier runs.
    print(f'Mean Squared Error: {mean_squared_error(y_test, pred)}')

    # 10-fold cross-validation on the training split, using the estimator's
    # default scorer.
    scores = cross_val_score(model, X_train, y_train, cv=10)
    print('\n')
    display_scores(scores)

    # Overlay the density of the predicted labels on that of the true labels.
    plt.figure(figsize=(4, 2))
    # `fill` replaces the deprecated `shade` keyword of seaborn.kdeplot.
    sns.kdeplot(pred, fill=True)
    sns.kdeplot(y_test, fill=True)
    plt.legend(['pred', 'y_test'])

    print('\n')
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Predictions - decision tree

from sklearn.tree import DecisionTreeClassifier

# Fixed seed so repeated runs build the same tree and report the same scores
# (same seed value as the train/test split).
tree = DecisionTreeClassifier(random_state=42)

predict(tree)

In [None]:
# Predictions - random forest

from sklearn.ensemble import RandomForestClassifier

# Fixed seed so repeated runs build the same forest and report the same
# scores (same seed value as the train/test split).
forest = RandomForestClassifier(random_state=42)

predict(forest)

In [None]:
#Predictions - KNN
# Evaluate a k-nearest-neighbours classifier with default settings via predict().

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

predict(knn)

In [None]:
# Predictions - XGBoost classifier with default settings, evaluated via predict().
from xgboost import XGBClassifier

xgb = XGBClassifier()

predict(xgb)

In [None]:
#Parameter tuning on XGBClassifier

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyper-parameter values sampled by the randomized search.
params={
    'learning_rate':[0.05, 0.20, 0.25],
    'max_depth':[5, 8, 10],
    'min_child_weight':[1, 3, 5, 7],
    'gamma':[0.0, 0.1, 0.2, 0.4],
    'colsample_bytree':[0.3, 0.4, 0.7]
}

# Randomized search: 5 sampled settings, each scored with 5-fold CV.
# scoring: plain 'roc_auc' is only defined for binary targets. 'Vict Descent'
# is label-encoded and presumably has more than two classes, in which case
# every candidate would score NaN and the "best" estimator would be arbitrary.
# Accuracy works for any number of classes and matches the decision-tree grid
# search; use 'roc_auc_ovr' instead if an AUC-based ranking is required.
# random_state makes the sampled candidates repeatable.
random_search = RandomizedSearchCV(xgb,
                  param_distributions=params,
                  n_iter=5,
                  scoring='accuracy',
                  n_jobs=-1,
                  cv=5,
                  random_state=42,
                  verbose=3)

random_search.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination that scored best in the randomized search.
random_search.best_params_

In [None]:
# Feature importances of the tuned XGBoost model, one bar per feature.
best_xgb_importances = random_search.best_estimator_.feature_importances_
features = pd.Series(best_xgb_importances, index=X.columns).reset_index()

# After reset_index the names sit in 'index' and the importances in column 0.
sns.barplot(data=features, y='index', x=0)

In [None]:
# Evaluate the best estimator found by the randomized search (predict() refits it on the training split).
predict(random_search.best_estimator_)

In [None]:
# Parameter tuning on the decision tree

from sklearn.model_selection import GridSearchCV

# Every combination of these values is evaluated by the grid search.
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
}

# Exhaustive search scored by accuracy with 5-fold CV, using all CPU cores.
grid_search = GridSearchCV(
    estimator=tree,
    param_grid=params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination that scored best in the grid search.
grid_search.best_params_

In [None]:
# Feature importances of the tuned decision tree, one bar per feature.
best_tree_importances = grid_search.best_estimator_.feature_importances_
tree_features = pd.Series(best_tree_importances, index=X.columns).reset_index()

# After reset_index the names sit in 'index' and the importances in column 0.
sns.barplot(data=tree_features, x=0, y='index')

In [None]:
# Evaluate the best estimator found by the grid search (predict() refits it on the training split).
predict(grid_search.best_estimator_)