In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn import FunctionSampler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
#import cloudpickle as pickle
import dill as pickle
import pickletools

In [2]:
import cloudpickle
cloudpickle.__version__

'1.6.0'

In [3]:
class UploadDataFrame:
    def upload_court_dataframe(self):
        try:
            return pd.read_csv(r'E:\Compressed\final_df\final_df.csv')
        except FileNotFoundError as fe:
            print("File not found:", fe)
        except pd.errors.EmptyDataError:
            print("No data")
        except pd.errors.ParserError:
            print("Parse error")
        except Exception as e:
            print("Error:", e)


data = UploadDataFrame()
df = data.upload_court_dataframe()
df = df[:2000]

In [4]:
class DataCleaningAndSplitting:
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def cleaning_dataframe(self):
        try:
            self.dataframe.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'T2M', 'YEAR', 'DY', 'day', 'Date'], inplace=True)
            self.dataframe['incident_dates'] = self.dataframe['incident_dates'].notnull().astype('int')
            self.dataframe = self.dataframe.drop(columns=['Facts / Distinguishing Features'])
            self.dataframe = self.dataframe.rename(columns={'Indictment / Offence': 'offence', 'T2M_MAX': 'temp_max',
                                                            'T2M_MIN': 'temp_min'})
            x = self.dataframe.drop(columns=['incident_dates', 'index'])
            y = self.dataframe['incident_dates']
            x_trn, x_tst, y_trn, y_tst = train_test_split(x, y, test_size=0.3, random_state=1)
            return x_trn, x_tst, y_trn, y_tst

        except KeyError as ke:
            return 'KeyError at method cleaning_dataframe of class DataCleaningAndSplitting:', ke
        except ValueError as ve:
            return 'ValueError at method cleaning_dataframe of class DataCleaningAndSplitting:', ve
        except Exception as e:
            return 'Error at method cleaning_dataframe of class DataCleaningAndSplitting:', e


In [5]:
def splitting_dataframe():
    try:
        df_clean_split = DataCleaningAndSplitting(df)
        return df_clean_split.cleaning_dataframe()
    except KeyError as ke:
        return 'KeyError at function splitting_dataframe:', ke
    except ValueError as ve:
        return 'ValueError at function splitting_dataframe :', ve
    except Exception as e:
        return 'Error at function splitting_dataframe :', e



df_cleaning=DataCleaningAndSplitting(df)
x_train,x_test,y_train,y_test=splitting_dataframe()


In [6]:
class FillNan(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform (self, X, y=None):
        try:
            col_nan = X.columns[X.isnull().sum() > 0].to_list()
            for col in col_nan:
                X[col][X[col].isnull()] = X[col].dropna().sample(X[col].isnull().sum()).values
            return X
        except KeyError as ke:
            return 'KeyError at transform of class FillNan :', ke
        except ValueError as ve:
            return 'ValueError at transform of class FillNan:', ve
        except Exception as e:
            return 'Error at transform of class FillNan:', e


In [7]:
class TypeCastingCourtRoom(BaseEstimator, TransformerMixin):
    def __init__(self, court_room):
        self.court_room = court_room

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        try:
            X[self.court_room] = X[self.court_room].astype(str)
            return X
        except KeyError as ke:
            return 'KeyError at transform of class TypeCastingCourtRoom :', ke
        except ValueError as ve:
            return 'ValueError at transform of class TypeCastingCourtRoom:', ve
        except Exception as e:
            return 'Error at transform of class TypeCastingCourtRoom:', e

def removingoutliers(X,y):
    lower_outlier=X[X['Age'].str.split().str.get(0).astype(int)<10].index.to_list()
    upper_outlier=X[X['Age'].str.split().str.get(0).astype(int)>80].index.to_list()
    outliers = lower_outlier+upper_outlier
    X=X.drop(index=outliers)
    y=y.drop(index=outliers)
    return X,y

In [8]:
class CleaningAgeGenderSentence(BaseEstimator, TransformerMixin):
    def __init__(self, Age, Gender, Sentence):
        self.Gender = Gender
        self.Age = Age
        self.Sentence = Sentence

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        try:
            X[self.Age] = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')
            X[self.Gender] = X[self.Gender].apply(lambda a: X[self.Gender].mode()[0] if a == 'No Answer' else a)
            X[self.Sentence] = X[self.Sentence].str.split().str.get(-1)
            X[self.Sentence] = X[self.Sentence].apply(lambda x: 'Non-custodial' if x == 'Non-custodialNon-custodial'else x)
            
            return pd.DataFrame(X)
        
        except KeyError as ke:
            return 'KeyError at transform of class CleaningAgeGenderSentence :', ke
        except ValueError as ve:
            return 'ValueError at transform of class CleaningAgeGenderSentence:', ve
        except Exception as e:
            return 'Error at transform of class CleaningAgeGenderSentence:', e

In [9]:
class CleaningOffence(BaseEstimator, TransformerMixin):
    lst_offence = ['Murder', 'Manslaughter', 'GBH', 'drugs', 'Rape', 'driving', 'Robbery', 'ABH', 'Sexual', 'burglary',
                   'weapon', 'wounding']

    def __init__(self, offence):
        self.offence = offence

    def fit(self, X, y=None):
        return self

    def check_word(self, text):
        try:
            for i in CleaningOffence.lst_offence:
                if i.lower() in text.split():
                    return i.lower()
        except KeyError as ke:
            return 'KeyError at check_word of class CleaningOffence :', ke
        except ValueError as ve:
            return 'ValueError at check_word of class CleaningOffence:', ve
        except Exception as e:
            return 'Error at check_word of class CleaningOffence:', e

    def transform(self, X, y=None):
        try:
            X[self.offence] = X[self.offence].apply(self.check_word)
            X[self.offence] = X[self.offence].apply(lambda x: 'other' if x == None else x.lower())
            X[self.offence] = X[self.offence].apply(lambda x: 'burglary' if x == 'Robbery' else x.lower())
            X[self.offence] = X[self.offence].apply(lambda x: 'Sexual' if x == 'Rape' else x.lower())
            X[self.offence] = X[self.offence].apply(lambda x: 'body_harm' if x == 'ABH' or x == 'wounding' else x.lower())
            X[self.offence] = X[self.offence].apply(lambda x: 'Murder' if x == 'Manslaughter' else x.lower())
            return X
        except KeyError as ke:
            return 'KeyError at transform of class CleaningOffence :', ke
        except ValueError as ve:
            return 'ValueError at transform of class CleaningOffence:', ve
        except Exception as e:
            return 'Error at transform of class CleaningOffence:', e

In [10]:
class OHE(BaseEstimator,TransformerMixin):

    def fit(self,X,y=None):
        return self

    def transform(self,X,y=None):
        try:
            cat_cols=X.select_dtypes(include='object').columns.to_list()
            ohe=OneHotEncoder(sparse=False,handle_unknown='ignore',drop='first',dtype='int')
            ohe_df=pd.DataFrame(ohe.fit_transform(X[cat_cols]))
            ohe_df.columns = ohe.get_feature_names_out(input_features = cat_cols)
            num_cols=X.select_dtypes(exclude='object').columns.to_list()
            num_df=X[num_cols]
            return pd.concat([num_df, ohe_df.set_index(num_df.index)], axis=1)
        except KeyError as ke:
            return 'KeyError at transform of class OHE :', ke
        except ValueError as ve:
            return 'ValueError at transform of class OHE:', ve
        except Exception as e:
            return 'Error at transform of class OHE:', e

In [11]:
class PCAColumns(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None):
        return self

    def transform(self,X,y=None):
        pca=PCA(n_components=0.4)
        X=pd.DataFrame(pca.fit_transform(X))
        return X

In [12]:
def random_over_sampler():
    try:
        over = RandomOverSampler(sampling_strategy=1, random_state=42)
        return over

    except KeyError as ke:
        return 'KeyError at function of random_over_sampler:', ke
    except ValueError as ve:
        return 'ValueError at function of random_over_sampler:', ve
    except Exception as e:
        return 'Error at function of random_over_sampler:', e

In [13]:
random_forest_param_grid = {
    'n_estimators': [100,200,300,400,500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy'] }


random_forest_class = Pipeline(steps=[

    ('nan_transformer', FillNan()),
    ('type_cast', TypeCastingCourtRoom('court_room')),
    ('remove_outliers', FunctionSampler(func=removingoutliers, validate=False)),
    ('cleaning_columns', CleaningAgeGenderSentence('Age', 'Gender', 'Sentence')),
    ('cleaning_offence', CleaningOffence('offence')),
    ('ohe', OHE()),
    ('over', random_over_sampler()),
    ('pca',PCAColumns()),
    ('rf',RandomForestClassifier()),
    #('rf',GridSearchCV(RandomForestClassifier(),random_forest_param_grid))
])

In [14]:
decision_tree_param_grid = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}

decision_tree_class= Pipeline(steps=[
    ('nan_transformer', FillNan()),
    ('type_cast', TypeCastingCourtRoom('court_room')),
    ('remove_outliers', FunctionSampler(func=removingoutliers, validate=False)),
    ('cleaning_columns', CleaningAgeGenderSentence('Age', 'Gender', 'Sentence')),
    ('cleaning_offence', CleaningOffence('offence')),
    ('ohe', OHE()),
    ('over', random_over_sampler()),
    ('pca', PCAColumns()),
    ('dt',DecisionTreeClassifier(random_state=1))
    #('dt',GridSearchCV(DecisionTreeClassifier(),decision_tree_param_grid))
     ])

In [15]:
logistic_reg_param_grid = {
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000]
    }

logistic_regression = Pipeline(steps=[
    ('nan_transformer', FillNan()),
    ('type_cast', TypeCastingCourtRoom('court_room')),
    ('remove_outliers', FunctionSampler(func=removingoutliers, validate=False)),
    ('cleaning_columns', CleaningAgeGenderSentence('Age', 'Gender', 'Sentence')),
    ('cleaning_offence', CleaningOffence('offence')),
    ('ohe', OHE()),
    ('over', random_over_sampler()),
    ('pca', PCAColumns()),
    ('lr', LogisticRegression(random_state=1))
    #('lr',GridSearchCV(LogisticRegression(),logistic_reg_param_grid))
     ])

In [16]:
Ada_boost_param_grid ={
    'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 20, 30],
    'learning_rate': [0.01,0.1,1,10],
    'algorithm': ['SAMME', 'SAMME.R']
}
ada_boost_class = Pipeline(steps=[
    ('nan_transformer', FillNan()),
    ('type_cast', TypeCastingCourtRoom('court_room')),
    ('remove_outliers', FunctionSampler(func=removingoutliers, validate=False)),
    ('cleaning_columns', CleaningAgeGenderSentence('Age', 'Gender', 'Sentence')),
    ('cleaning_offence', CleaningOffence('offence')),
    ('ohe', OHE()),
    ('over', random_over_sampler()),
    ('pca', PCAColumns()),
    ('ada',AdaBoostClassifier(random_state=1))
    #('ada',GridSearchCV(AdaBoostClassifier(),Ada_boost_param_grid))
])


In [17]:
gradient_boosting_param_grid = {
    "n_estimators":[5,50,100,250,350,500],
    "max_depth":[1,3,5,7,9,11,13],
    "learning_rate":[0.01,0.1,1,10]
}

gradient_boosting_class = Pipeline(steps=[
    ('nan_transformer', FillNan()),
    ('type_cast', TypeCastingCourtRoom('court_room')),
    ('remove_outliers', FunctionSampler(func=removingoutliers, validate=False)),
    ('cleaning_columns', CleaningAgeGenderSentence('Age', 'Gender', 'Sentence')),
    ('cleaning_offence', CleaningOffence('offence')),
    ('ohe', OHE()),
    ('over', random_over_sampler()),
    ('pca', PCAColumns()),
    ('gb', GradientBoostingClassifier())
    ])
    #('gb',GridSearchCV(GradientBoostingClassifier(),gradient_boosting_param_grid))

In [18]:
class BestModel:
    mypipeline=[random_forest_class, gradient_boosting_class, ada_boost_class, decision_tree_class, logistic_regression]
    pipeline_dict = {0: 'Random Forest',1: 'Decision Tree', 3: 'Ada Boost', 4: 'Gradient Boosting',
                     2: 'Logistic Regression'}
    best_model = []
    accuracy = 0
    classifier = 0
    pipeline = ''
    def prediction(self):
            try:
                for pipe in BestModel.mypipeline:
                    pipe.fit(x_train, y_train)

                for i,model in enumerate(BestModel.mypipeline):
                    if model.score(x_test,y_test)>BestModel.accuracy:
                        BestModel.accuracy=model.score(x_test,y_test)
                        BestModel.pipeline=model
                        BestModel.classfier=i
                BestModel.best_model.append(BestModel.pipeline_dict[BestModel.classfier])
                #print('The Best classifier is {}'.format(BestModel.pipeline_dict[BestModel.classfier]))

                return model

            except KeyError as ke:
                return 'KeyError at class of BestModel:', ke
            except ValueError as ve:
                return 'ValueError at class of BestModel:', ve
            except Exception as e:
                return 'Error at class of BestModel:', e

In [19]:
best_model = BestModel().prediction()

In [20]:
best_model

In [21]:
best_model.predict(
pd.DataFrame([['Male', '27 Years', 'ABH Assault occasioning actual bodily harm',
                                   'Custodial immediate', 3, 15.21, 7.57,
                                   'May', 'For Trial']],
                                 columns=['Gender', 'Age', 'offence', 'Sentence', 'court_room', 'temp_max',
                                          'temp_min', 'month', 'reason'])
                )

array([0])

In [22]:
#pd.DataFrame([[data.values()]],columns=[data.keys()])

In [23]:
best_model.predict(pd.DataFrame([['Male', '27 Years', 'ABH Assault occasioning actual bodily harm',
                                   'Custodial immediate', 3, 15.21, 7.57,
                                   'May', 'For Trial']],
                                 columns=['Gender', 'Age', 'offence', 'Sentence', 'court_room', 'temp_max',
                                          'temp_min', 'month', 'reason']))

array([0])

In [24]:
import cloudpickle as pickle
import pickletools

In [25]:
with open('model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

In [26]:
with open('model.pkl' , 'rb') as f:
    model = pickle.load(f)

In [27]:
model

In [28]:
model.predict(
pd.DataFrame([['Male', '27 Years', 'ABH Assault occasioning actual bodily harm',
                                   'Custodial immediate', 3, 15.21, 7.57,
                                   'May', 'For Trial']],
                                 columns=['Gender', 'Age', 'offence', 'Sentence', 'court_room', 'temp_max',
                                          'temp_min', 'month', 'reason'])
)[0]

0

In [29]:
data={
    "Gender":"Male",
    "Age" : "27 Years",
    "offence":"ABH Assault occasioning actual bodily harm",
    "Sentence":"Custodial immediate",
    "court_room":3,
    "temp_max":15.21,
    "temp_min": 7.57,
    "month" : "May",
    "reason":"For Trial"   
}

In [30]:
model.predict(pd.DataFrame([data]))[0]

0

In [31]:
from platform import python_version
python_version()

'3.8.5'

In [32]:
y_train

1194    0
45      0
1477    0
1293    1
1736    0
       ..
1791    0
1096    0
1932    1
235     0
1061    1
Name: incident_dates, Length: 1400, dtype: int32

In [33]:
x_train.iloc[1293]

Gender                               Male
Age                              72 Years
offence       Indecent assault [6 counts]
Sentence              Custodial immediate
court_room                            2.0
temp_max                             9.85
temp_min                             2.73
month                                 Nov
reason                Trial (Part Heard) 
Name: 1817, dtype: object

In [34]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1400 entries, 1194 to 1061
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      1400 non-null   object 
 1   Age         1400 non-null   object 
 2   offence     1400 non-null   object 
 3   Sentence    1400 non-null   object 
 4   court_room  1400 non-null   object 
 5   temp_max    1400 non-null   float64
 6   temp_min    1400 non-null   float64
 7   month       1400 non-null   object 
 8   reason      1400 non-null   object 
dtypes: float64(2), object(7)
memory usage: 141.7+ KB
