<a href="https://colab.research.google.com/github/aronpwong/applications-of-ml/blob/main/multi_class_xgboost_aw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve,auc,log_loss,cohen_kappa_score
from warnings import filterwarnings
filterwarnings('ignore')
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedGroupKFold

In [5]:
# Attempt to read the CSV while skipping bad lines
test_data = pd.read_csv('/content/test.csv', on_bad_lines='skip')

# Print out the first few rows to check the data
print(test_data.head())

   Id                Dates DayOfWeek PdDistrict                   Address  \
0   0  2015-05-10 23:59:00    Sunday    BAYVIEW   2000 Block of THOMAS AV   
1   1  2015-05-10 23:51:00    Sunday    BAYVIEW        3RD ST / REVERE AV   
2   2  2015-05-10 23:50:00    Sunday   NORTHERN    2000 Block of GOUGH ST   
3   3  2015-05-10 23:45:00    Sunday  INGLESIDE  4700 Block of MISSION ST   
4   4  2015-05-10 23:45:00    Sunday  INGLESIDE  4700 Block of MISSION ST   

            X          Y  
0 -122.399588  37.735051  
1 -122.391523  37.732432  
2 -122.426002  37.792212  
3 -122.437394  37.721412  
4 -122.437394  37.721412  


In [9]:
# Attempt to read the CSV while skipping bad lines
train_data = pd.read_csv('/content/train.csv', on_bad_lines='skip')

# Print out the first few rows to check the data
print(train_data.head())

                 Dates        Category                      Descript  \
0  2015-05-13 23:53:00        WARRANTS                WARRANT ARREST   
1  2015-05-13 23:53:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
2  2015-05-13 23:33:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
3  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   
4  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   

   DayOfWeek PdDistrict      Resolution                    Address  \
0  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
1  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
2  Wednesday   NORTHERN  ARREST, BOOKED  VANNESS AV / GREENWICH ST   
3  Wednesday   NORTHERN            NONE   1500 Block of LOMBARD ST   
4  Wednesday       PARK            NONE  100 Block of BRODERICK ST   

            X          Y  
0 -122.425892  37.774599  
1 -122.425892  37.774599  
2 -122.424363  37.800414  
3 -122.426995  37.800873  
4 -122.4387

In [10]:
train_data.shape,test_data.shape

((878049, 9), (904800, 7))

In [12]:
train_data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [13]:
test_data.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [15]:
train_data['Category'].value_counts().sort_index()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ARSON,1513
ASSAULT,76876
BAD CHECKS,406
BRIBERY,289
BURGLARY,36755
DISORDERLY CONDUCT,4320
DRIVING UNDER THE INFLUENCE,2268
DRUG/NARCOTIC,53971
DRUNKENNESS,4280
EMBEZZLEMENT,1166


In [16]:
print(f"Number of Categories: {train_data['Category'].nunique()}")
print(f"Number of Districts: {train_data['PdDistrict'].nunique()}")

Number of Categories: 39
Number of Districts: 10


In [18]:
train_data.dtypes

Unnamed: 0,0
Dates,object
Category,object
Descript,object
DayOfWeek,object
PdDistrict,object
Resolution,object
Address,object
X,float64
Y,float64


In [19]:
train_data.isnull().sum()

Unnamed: 0,0
Dates,0
Category,0
Descript,0
DayOfWeek,0
PdDistrict,0
Resolution,0
Address,0
X,0
Y,0


In [20]:
test_data.dtypes

Unnamed: 0,0
Id,int64
Dates,object
DayOfWeek,object
PdDistrict,object
Address,object
X,float64
Y,float64


In [21]:
test_data.isnull().sum()

Unnamed: 0,0
Id,0
Dates,0
DayOfWeek,0
PdDistrict,0
Address,1
X,1
Y,2


In [22]:
train_data['Date'] = pd.to_datetime(train_data['Dates'])
test_data['Date'] = pd.to_datetime(test_data['Dates'])

In [23]:
train_data['Year'] = train_data['Date'].dt.year
test_data['Year'] = test_data['Date'].dt.year

train_data['Day'] = train_data['Date'].dt.day
test_data['Day'] = test_data['Date'].dt.day

train_data['Day_Week'] = train_data['Date'].dt.day_of_week
test_data['Day_Week'] = test_data['Date'].dt.day_of_week

train_data['Month'] = train_data['Date'].dt.month
test_data['Month'] = test_data['Date'].dt.month

train_data['Hour'] = train_data['Date'].dt.hour
test_data['Hour'] = test_data['Date'].dt.hour

train_data['Minute'] = train_data['Date'].dt.minute
test_data['Minute'] = test_data['Date'].dt.minute

In [24]:
train_data.drop(columns=['DayOfWeek','Dates','Date'],axis=1,inplace=True)
test_data.drop(columns=['DayOfWeek','Dates','Date'],axis=1,inplace=True)

In [25]:
PdDistrict_dummies_train = pd.get_dummies(train_data['PdDistrict'])

In [26]:
train_data = pd.concat([train_data.drop(columns='PdDistrict',axis=1),PdDistrict_dummies_train],axis=1)

In [28]:
train_data.head()

Unnamed: 0,Category,Descript,Resolution,Address,X,Y,Year,Day,Day_Week,Month,...,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,WARRANTS,WARRANT ARREST,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,13,2,5,...,False,False,False,False,True,False,False,False,False,False
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,13,2,5,...,False,False,False,False,True,False,False,False,False,False
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,13,2,5,...,False,False,False,False,True,False,False,False,False,False
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,13,2,5,...,False,False,False,False,True,False,False,False,False,False
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,13,2,5,...,False,False,False,False,False,True,False,False,False,False


In [29]:
PdDistrict_dummies_test = pd.get_dummies(test_data['PdDistrict'])

In [30]:
test_data = pd.concat([test_data.drop(columns='PdDistrict',axis=1),PdDistrict_dummies_test],axis=1)

In [31]:
test_data.head()

Unnamed: 0,Id,Address,X,Y,Year,Day,Day_Week,Month,Hour,Minute,...,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,T,TARAVAL,TENDERLOIN
0,0,2000 Block of THOMAS AV,-122.399588,37.735051,2015,10,6,5,23,59,...,False,False,False,False,False,False,False,False,False,False
1,1,3RD ST / REVERE AV,-122.391523,37.732432,2015,10,6,5,23,51,...,False,False,False,False,False,False,False,False,False,False
2,2,2000 Block of GOUGH ST,-122.426002,37.792212,2015,10,6,5,23,50,...,False,False,False,True,False,False,False,False,False,False
3,3,4700 Block of MISSION ST,-122.437394,37.721412,2015,10,6,5,23,45,...,False,True,False,False,False,False,False,False,False,False
4,4,4700 Block of MISSION ST,-122.437394,37.721412,2015,10,6,5,23,45,...,False,True,False,False,False,False,False,False,False,False


In [32]:
from sklearn.preprocessing import LabelEncoder

category_label = LabelEncoder()

In [33]:
train_data['Category_Value'] = pd.Series(category_label.fit_transform(train_data['Category']))
train_data.head()

Unnamed: 0,Category,Descript,Resolution,Address,X,Y,Year,Day,Day_Week,Month,...,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,Category_Value
0,WARRANTS,WARRANT ARREST,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,13,2,5,...,False,False,False,True,False,False,False,False,False,37
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,13,2,5,...,False,False,False,True,False,False,False,False,False,21
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,13,2,5,...,False,False,False,True,False,False,False,False,False,21
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,13,2,5,...,False,False,False,True,False,False,False,False,False,16
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,13,2,5,...,False,False,False,False,True,False,False,False,False,16


In [34]:
train_data.columns

Index(['Category', 'Descript', 'Resolution', 'Address', 'X', 'Y', 'Year',
       'Day', 'Day_Week', 'Month', 'Hour', 'Minute', 'BAYVIEW', 'CENTRAL',
       'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN',
       'TARAVAL', 'TENDERLOIN', 'Category_Value'],
      dtype='object')

In [35]:
test_data.columns

Index(['Id', 'Address', 'X', 'Y', 'Year', 'Day', 'Day_Week', 'Month', 'Hour',
       'Minute', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN',
       'PARK', 'RICHMOND', 'SOUTHERN', 'T', 'TARAVAL', 'TENDERLOIN'],
      dtype='object')

In [36]:
columns_to_keep = ['X', 'Y', 'Year', 'Day', 'Day_Week', 'Month', 'BAYVIEW',
       'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND',
       'SOUTHERN', 'TARAVAL', 'TENDERLOIN']

In [37]:
X_train = train_data[columns_to_keep]
y_train = train_data['Category_Value']

X_test = test_data[columns_to_keep]
test_ids = test_data['Id']

In [38]:
dtrain = xgb.DMatrix(X_train,label=y_train)
dtest = xgb.DMatrix(X_test)

In [39]:
params = {
    'objective': 'multi:softmax',
    'num_class': 39,
    'max_depth': 6,
    'eta': 0.3
}

In [40]:
num_rounds = 10
xgb_model = xgb.train(params,dtrain,num_rounds)

In [41]:
y_pred = xgb_model.predict(dtest)

In [42]:
#getting raw values for prediction
y_pred_proba = xgb_model.predict(dtest,output_margin=True)

# Apply sigmoid transformation to convert to probabilities
probabilities = 1 / (1 + np.exp(-y_pred_proba))

# Normalize each row
normalized_array = probabilities / probabilities.sum(axis=1, keepdims=True)

# Display the normalized array
print(normalized_array)

[[0.0166004  0.04805369 0.01486121 ... 0.0458661  0.03848955 0.0284115 ]
 [0.01633379 0.04783809 0.01462253 ... 0.04108599 0.04441136 0.03374036]
 [0.0217636  0.04704289 0.01513356 ... 0.04505881 0.03346601 0.01756284]
 ...
 [0.01615639 0.04454982 0.02387077 ... 0.04699274 0.03732948 0.02392699]
 [0.01940769 0.04626665 0.01547539 ... 0.04703236 0.04399991 0.02935931]
 [0.01704897 0.04175104 0.01432455 ... 0.04592406 0.03207234 0.02287934]]


In [43]:
df = pd.DataFrame(normalized_array, columns=category_label.classes_)

In [44]:
df.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0.0166,0.048054,0.014861,0.016146,0.035586,0.016813,0.019809,0.035166,0.018315,0.015563,...,0.014859,0.022123,0.015103,0.040451,0.014507,0.019695,0.042495,0.045866,0.03849,0.028411
1,0.016334,0.047838,0.014623,0.016116,0.035015,0.017886,0.01949,0.041875,0.018385,0.015313,...,0.01462,0.021768,0.01486,0.039801,0.014274,0.019379,0.041607,0.041086,0.044411,0.03374
2,0.021764,0.047043,0.015134,0.014846,0.046075,0.016643,0.020342,0.028784,0.02058,0.01519,...,0.014722,0.025745,0.014988,0.037254,0.014552,0.028333,0.043648,0.045059,0.033466,0.017563
3,0.016785,0.048959,0.014799,0.015632,0.033546,0.018176,0.021669,0.032179,0.019244,0.015498,...,0.014797,0.022871,0.015578,0.038259,0.014446,0.020888,0.043991,0.047251,0.03777,0.028352
4,0.016785,0.048959,0.014799,0.015632,0.033546,0.018176,0.021669,0.032179,0.019244,0.015498,...,0.014797,0.022871,0.015578,0.038259,0.014446,0.020888,0.043991,0.047251,0.03777,0.028352


In [45]:
submission = pd.concat([test_ids,df],axis=1)

In [46]:
submission.head()

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.0166,0.048054,0.014861,0.016146,0.035586,0.016813,0.019809,0.035166,0.018315,...,0.014859,0.022123,0.015103,0.040451,0.014507,0.019695,0.042495,0.045866,0.03849,0.028411
1,1,0.016334,0.047838,0.014623,0.016116,0.035015,0.017886,0.01949,0.041875,0.018385,...,0.01462,0.021768,0.01486,0.039801,0.014274,0.019379,0.041607,0.041086,0.044411,0.03374
2,2,0.021764,0.047043,0.015134,0.014846,0.046075,0.016643,0.020342,0.028784,0.02058,...,0.014722,0.025745,0.014988,0.037254,0.014552,0.028333,0.043648,0.045059,0.033466,0.017563
3,3,0.016785,0.048959,0.014799,0.015632,0.033546,0.018176,0.021669,0.032179,0.019244,...,0.014797,0.022871,0.015578,0.038259,0.014446,0.020888,0.043991,0.047251,0.03777,0.028352
4,4,0.016785,0.048959,0.014799,0.015632,0.033546,0.018176,0.021669,0.032179,0.019244,...,0.014797,0.022871,0.015578,0.038259,0.014446,0.020888,0.043991,0.047251,0.03777,0.028352


In [47]:
submission.to_csv('submission.csv', index=False)