In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas SettingWithCopyWarning and library
# deprecation warnings raised by the cells below.
warnings.filterwarnings("ignore")
# Apply the ggplot style to all matplotlib figures.
plt.style.use('ggplot')
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Read the raw CSV, then drop the columns that play no part in the analysis.
unused_columns = ['Unnamed: 0', 'Population', 'Crime Cnt', 'Crime Rate']
data = pd.read_csv('data.csv').drop(columns=unused_columns)

# Import data_prediction.csv: it is a subset of the original file and will be sufficient for testing.

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Print the frame's summary (columns, dtypes, non-null counts).
data.info()

In [None]:
# Cast Year to str — presumably so it is handled as a categorical column by
# the dtype-based categorical/numeric split further down.
data['Year'] = data['Year'].astype(str)

In [None]:
# List the distinct raw values of the target column before it is encoded.
data['Vict Sex'].unique()

In [None]:
# Split data into data_cat (object-dtype columns) and data_num (all others).
# Both frames are explicit copies: later cells assign into data_cat, and
# writing into a bare selection of `data` is the chained-assignment pattern
# pandas flags with SettingWithCopyWarning.

is_object = data.dtypes == 'O'
data_cat = data.loc[:, is_object].copy()
data_num = data.loc[:, ~is_object].copy()

In [None]:
# Label-encode the Vict Sex column: each distinct value becomes an integer code.

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# fit_transform learns the set of classes and returns the code for every row.
data_cat['Vict Sex'] = label_encoder.fit_transform(data_cat['Vict Sex'])

# Show the resulting integer codes.
data_cat['Vict Sex'].unique()

In [None]:
# Feature engineering: target-mean encoding.
# Every categorical column except the target is replaced by the mean of the
# encoded 'Vict Sex' observed for that category value.
# NOTE(review): the means are computed on the whole dataset, before the
# train/test split performed later, so test-set targets leak into the features.

for col in data_cat.columns.drop('Vict Sex'):
    category_means = data_cat.groupby([col])['Vict Sex'].mean().to_dict()
    data_cat[col] = data_cat[col].map(category_means)

# Put the encoded categorical columns and the numeric columns side by side.
df = pd.concat([data_cat, data_num], axis=1)

In [None]:
# Correlation of every column with the target, largest coefficient first.

df.corr()['Vict Sex'].sort_values(ascending=False)

In [None]:
# Annotated heatmap of the full correlation matrix on a wide canvas.
fig, ax = plt.subplots(figsize=(17, 7))
sns.heatmap(df.corr(), annot=True, ax=ax)

In [None]:
# Feature selection, step 1: let a Lasso fit decide which columns matter.

# Separate the target from the predictors.
y = df['Vict Sex']
X = df.drop(columns='Vict Sex')

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# alpha=0.005 is the regularisation strength handed to Lasso.
model = SelectFromModel(Lasso(alpha=0.005))
model.fit(X, y)
# Boolean mask over X's columns: True where the selector keeps the column.
model.get_support()

In [None]:
# Show only the columns kept by the Lasso-based selector.
X[X.columns[model.get_support()]]

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Feature selection, step 2: chi-squared score for every column (k='all'
# keeps them all, so this only ranks). LON is left out of the scoring —
# presumably because chi2 rejects negative values; TODO confirm.
orderd_rank_features = SelectKBest(score_func=chi2, k='all')
ordered_feature = orderd_rank_features.fit(X.drop(columns='LON'), y)

In [None]:
# Raw chi-squared scores, in the same order as the columns that were scored.
ordered_feature.scores_

In [None]:
# Pair each scored column name (column 0) with its chi-squared score and
# rank the pairs from highest to lowest score.
features = pd.DataFrame({
    0: X.drop(columns='LON').columns,
    'score': ordered_feature.scores_,
}).sort_values('score', ascending=False)
features

In [None]:
# Keep only the eight best-scoring columns as model inputs.
top_features = features[0].head(8).tolist()
X = X[top_features]

In [None]:
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
# Display cross-validation scores

def display_scores(scores):
    """Print a score array, its mean and its standard deviation.

    Each of the three values is printed on its own line, framed above and
    below by a line of '=' characters.

    Parameters
    ----------
    scores : array-like exposing ``.mean()`` and ``.std()``
        For example the array returned by ``cross_val_score``.

    Returns
    -------
    None
    """
    rule = '==============================================='
    # (label, how to derive the value); evaluated one at a time, in print order.
    rows = (
        ('Scores', lambda s: s),
        ('Mean Score', lambda s: s.mean()),
        ('Standard Deviation of Scores', lambda s: s.std()),
    )

    print(rule)
    for label, compute in rows:
        print('{}: {}'.format(label, compute(scores)))
        print(rule)


# Create the predict function

def predict(ml_model):
    """Fit ``ml_model`` on the training split and print an evaluation report.

    The function reads the notebook-level ``X_train``, ``X_test``, ``y_train``
    and ``y_test`` at call time, so the train/test split cell must have been
    run first.

    Printed/plotted, in order: the test-set predictions, the score on the
    training data, the confusion matrix, accuracy and mean squared error on
    the test set, the 10-fold cross-validation scores on the training data
    (via ``display_scores``), and a KDE plot comparing the distribution of
    predictions with that of ``y_test``.

    Parameters
    ----------
    ml_model : estimator
        Object exposing ``fit``, ``predict`` and ``score``; it is fitted here.

    Returns
    -------
    None
    """
    model = ml_model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print(f'Predictions: {pred}')
    print(f'Training Score: {model.score(X_train, y_train)}')
    print('\n')
    # scikit-learn metrics take (y_true, y_pred). For the confusion matrix the
    # order matters: rows are the true labels, columns the predicted ones.
    print(f'{confusion_matrix(y_test, pred)}')
    print('\n')
    print(f'Accuracy Score: {accuracy_score(y_test, pred)}')
    print(f'Mean Squared Error: {mean_squared_error(y_test, pred)}')

    # cross_val_score refits clones of the estimator on each of the 10 folds.
    scores = cross_val_score(model, X_train, y_train, cv=10)
    print('\n')
    display_scores(scores)

    plt.figure(figsize=(4, 2))
    # `fill` is the current name of the deprecated `shade` argument.
    sns.kdeplot(pred, fill=True)
    sns.kdeplot(y_test, fill=True)
    plt.legend(['pred', 'y_test'])

    print('\n')
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Predictions - decision tree

from sklearn.tree import DecisionTreeClassifier

# Tree construction is randomised (features are permuted at each split), so
# the estimator is seeded to make the reported scores reproducible; 42 is the
# same seed the train/test split uses.
tree = DecisionTreeClassifier(random_state=42)

predict(tree)

In [None]:
# Predictions - random forest

from sklearn.ensemble import RandomForestClassifier

# Bootstrapping and per-split feature sampling are random, so the estimator
# is seeded to make the reported scores reproducible; 42 is the same seed the
# train/test split uses.
forest = RandomForestClassifier(random_state=42)

predict(forest)

In [None]:
# Predictions - k-nearest neighbours (library default settings)

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

# Fit on the training split and print/plot the evaluation report.
predict(knn)

In [None]:
# Predictions - XGBoost (library default settings)
from xgboost import XGBClassifier

xgb = XGBClassifier()

# Fit on the training split and print/plot the evaluation report.
predict(xgb)