In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn warnings that may matter
# (e.g. chained-assignment or deprecation notices) - consider narrowing it.
warnings.filterwarnings("ignore")
# Apply the 'ggplot' matplotlib style to all figures drawn below.
plt.style.use('ggplot')
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
data = pd.read_csv('crime.csv').drop('Unnamed: 0', axis=1)

In [None]:
data.head()

In [None]:
num_col = [col for col in data.columns if data[col].dtype != 'O']

In [None]:
cat_col = [col for col in data.columns if data[col].dtype == 'O']

In [None]:
def sex_encoding(data, col):
    """Encode the values of ``data[col]`` as integer codes.

    'F' is encoded as 1, 'M' as 2, and every other value as 0.

    Parameters
    ----------
    data : any object indexable by ``col`` whose result is iterable
        (e.g. a pandas DataFrame).
    col : key of the column to encode.

    Returns
    -------
    list of int
        One code per value of ``data[col]``, in iteration order.
    """
    return [1 if value == 'F' else 2 if value == 'M' else 0 for value in data[col]]

In [None]:
cat_df = data[cat_col]

num_df = data[num_col]

In [None]:
cat_df['Vict Sex'] = sex_encoding(data, 'Vict Sex')

In [None]:
# Feature engineering, approach 1: replace each category with the mean encoded 'Vict Sex' of its rows

In [None]:
# Mean (target) encoding: every category value is replaced by the average
# encoded 'Vict Sex' of the rows that share that value.
# The target column itself is skipped - grouping 'Vict Sex' by 'Vict Sex' maps
# each code to itself and only turns the integer labels into floats.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split, so the encoded features carry target information from
# rows that later end up in the test set.
for col in cat_df.columns.drop('Vict Sex'):
    category_means = cat_df.groupby(col)['Vict Sex'].mean()
    cat_df[col] = cat_df[col].map(category_means)

In [None]:
cat_df

In [None]:
df = pd.concat([cat_df, num_df], axis=1)
df.head()

In [None]:
# Feature engineering, approach 2: label-encode the categorical columns

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Start again from the raw categorical columns. By this point `cat_df` has
# already been overwritten with per-category means by approach 1, so
# label-encoding it would only rank those means instead of encoding the
# original categories. Working on a separate frame also leaves `cat_df`
# untouched, so this cell can be re-run safely.
cat_df_label = data[cat_col].copy()
cat_df_label['Vict Sex'] = sex_encoding(data, 'Vict Sex')

for col in cat_col:
    if col == 'Vict Sex':
        # The target already holds its integer codes from sex_encoding.
        continue
    # Cast to str so missing values and mixed-type columns get a stable label
    # instead of depending on how LabelEncoder orders non-string objects.
    cat_df_label[col] = label_encoder.fit_transform(cat_df_label[col].astype(str))


df2 = pd.concat([cat_df_label, num_df], axis=1)

df2.head()

In [None]:
#We would like to proceed with approach 1 

In [None]:
# Feature selection, method 1: SelectFromModel with a Lasso estimator

In [None]:
# Target: the encoded 'Vict Sex' column; features: every other column of df.
y = df['Vict Sex']
X = df.drop(columns='Vict Sex')

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
model = SelectFromModel(Lasso(alpha=0.005))

In [None]:
model.fit(X, y)

In [None]:
model.get_support()

In [None]:
X = X.loc[:, X.columns[model.get_support()]]

In [None]:
X.head()

In [None]:
# Feature selection, method 2: mutual information between each feature and the target

In [None]:
from sklearn.feature_selection import mutual_info_regression

In [None]:
imp = mutual_info_regression(X, y)

In [None]:
imp

In [None]:
pd.DataFrame(imp, index=X.columns).sort_values(0, ascending=False)

In [None]:
# Feature selection, method 3: chi-squared scores via SelectKBest

In [None]:
# Rebuild X from df, leaving out the target and the two coordinate columns
# (presumably because chi2 rejects negative values - confirm).
y = df['Vict Sex']
X = df.drop(columns=['Vict Sex', 'LAT', 'LON'])

In [None]:
from sklearn.feature_selection import SelectKBest

from sklearn.feature_selection import chi2

In [None]:
# k='all' keeps every feature: the selector is used here only to obtain a
# chi-squared score per column, not to drop anything.
orderd_rank_features = SelectKBest(score_func=chi2, k='all')

# fit() returns the fitted selector, which is kept under a second name.
ordered_feature = orderd_rank_features.fit(X, y)

In [None]:
# Chi-squared score of each column of X, in column order.
ordered_feature.scores_

In [None]:
datascores = pd.DataFrame(ordered_feature.scores_, columns=['Score'])

In [None]:
feature_rank = pd.concat([pd.DataFrame(X.columns), datascores], axis=1)\
.sort_values('Score', ascending=False)

In [None]:
feature_rank.head()

In [None]:
X = X.loc[:, ['TIME OCC', 'Vict Age', 'Vict Descent', 'Crime Category', 'Weapon Category']]

In [None]:
#We would like to proceed with feature2

In [None]:
def predict(ml_model):
    """Fit ``ml_model`` on the training split and report its test performance.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``. Prints the training score, the raw predictions, the
    confusion matrix, the accuracy and the mean squared error on the test
    split, then draws density plots of the predictions and of ``y_test``.

    Parameters
    ----------
    ml_model : estimator exposing ``fit``, ``predict`` and ``score``.
        It is fitted in place by this call.

    Returns
    -------
    None
    """
    model = ml_model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print(f'Training Score: {model.score(X_train, y_train)}')
    print(f'Predictions are: {pred}')
    print('\n')
    # scikit-learn metrics take (y_true, y_pred). With the arguments swapped
    # the confusion matrix comes out transposed (rows would be predictions
    # instead of true classes), so pass the ground truth first everywhere.
    print(f'{confusion_matrix(y_test, pred)}')
    print(f'Accuracy Score: {accuracy_score(y_test, pred)}')
    print(f'Mean Squared Error: {mean_squared_error(y_test, pred)}')

    # Overlay the distribution of predicted and true labels.
    # NOTE(review): recent seaborn releases deprecate `shade` in favour of
    # `fill` - confirm the installed version before switching.
    plt.figure()
    sns.kdeplot(pred, shade=True)
    sns.kdeplot(y_test, shade=True)
    plt.legend(['pred', 'y_test'])
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix

In [None]:
def display_scores(scores):
    """Print a collection of scores with their mean and standard deviation.

    Parameters
    ----------
    scores : object exposing ``mean()`` and ``std()`` (for example the array
        returned by ``cross_val_score``).

    Returns
    -------
    None
        The three values are printed, each framed by separator lines.
    """
    separator = '==============================================='
    print(separator)
    print(f'Scores: {scores}')
    print(separator)
    print(f'Mean Score: {scores.mean()}')
    print(separator)
    print(f'Standard Deviation of Scores: {scores.std()}')
    print(separator)

In [None]:
#Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
predict(LogisticRegression())

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(log,
               X_train,
               y_train,
#                scoring='neg_mean_squared_error',
               cv=10)

In [None]:
display_scores(scores)

In [None]:
#DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Seed the tree so the reported scores are reproducible across runs
# (same seed as the train/test split).
predict(DecisionTreeClassifier(random_state=42))

In [None]:
#Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest so the reported scores are reproducible across runs
# (same seed as the train/test split).
predict(RandomForestClassifier(random_state=42))

In [None]:
from xgboost import XGBClassifier

In [None]:
# Base XGBoost classifier with default settings; its hyperparameters are
# tuned by the randomized search set up below.
boost = XGBClassifier()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

#grid search approach is fine when you are exploring relatively few combinations, 
#but when the hyperparameter search space is large, it is often preferable to use RandomizedSearchCV instead.

In [None]:
# Candidate values sampled by the randomized search.
params={
    'learning_rate':[0.05, 0.20, 0.25],
    'max_depth':[5, 8, 10],
    'min_child_weight':[1, 3, 5, 7],
    'gamma':[0.0, 0.1, 0.2, 0.4],
    'colsample_bytree':[0.3, 0.4, 0.7]
}

# The target can take three values (0/1/2 from sex_encoding). The plain
# 'roc_auc' scorer is only defined for binary targets, so every candidate
# would fail to score on multiclass data (silently, since warnings are
# suppressed in this notebook); the one-vs-rest variant works for both the
# binary and the multiclass case. random_state makes the sampled parameter
# combinations reproducible.
random_search = RandomizedSearchCV(boost,
                  param_distributions=params,
                  n_iter=5,
                  scoring='roc_auc_ovr',
                  n_jobs=-1,
                  cv=5,
                  random_state=42,
                  verbose=3)

In [None]:
# Run the randomized search: 5 sampled parameter settings, 5-fold CV each.
random_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
random_search.best_params_

In [None]:
# Evaluate the best estimator on the held-out split.
# Note that predict() calls fit() again, so the estimator is refitted on X_train.
predict(random_search.best_estimator_)