<a href="https://colab.research.google.com/github/arohanajit/hacked-or-not/blob/master/Untitled16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Download the training table. `-nc` (no-clobber) skips the download when
# Train.csv already exists, so re-running this cell does not leave a stale
# Train.csv next to a fresh "Train.csv.1" that nothing reads.
!wget -nc https://raw.githubusercontent.com/arohanajit/novartis-ds-challenge/master/Train.csv

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import seaborn as sns


In [0]:
def offense_analysis(df2,target):
    """Plot how many positive-target rows fall in each week, month and year bucket.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Must contain the columns 'WEEK' (values 0-3), 'MONTH' (values 0-11)
        and 'YEAR'.
    target : sequence
        One label per row of ``df2``, in the same row order. Rows whose label
        equals 1 are counted.

    Returns
    -------
    None
        Three bar charts are drawn side by side and shown with ``plt.show()``.

    Notes
    -----
    Rows are paired with ``target`` strictly by position, so the function
    works for any DataFrame index (the previous implementation mixed index
    labels with positional ``.iloc`` access and was only correct for a
    default 0..n-1 index).
    """
    week_dict = {i: 0 for i in range(4)}
    month_dict = {i: 0 for i in range(12)}
    year_dict = {'1991-1995':0,'1996-2000':0,'2001-2005':0,'2006-2010':0,
    '2011-2015':0,'2016-2020':0}

    # (inclusive upper bound, bucket label), in ascending order. The first
    # bucket also absorbs every year below 1991, and years above 2020 are
    # not counted at all -- both exactly as in the original if/elif chain.
    year_buckets = [
        (1995, '1991-1995'),
        (2000, '1996-2000'),
        (2005, '2001-2005'),
        (2010, '2006-2010'),
        (2015, '2011-2015'),
        (2020, '2016-2020'),
    ]

    rows = zip(target, df2['WEEK'], df2['MONTH'], df2['YEAR'])
    for flag, wk, mo, yr in rows:
        if flag != 1:
            continue
        week_dict[wk] += 1
        month_dict[mo] += 1
        for upper, label in year_buckets:
            if yr <= upper:
                year_dict[label] += 1
                break

    # NOTE: sns.set changes the seaborn/matplotlib defaults globally, so it
    # also affects every figure drawn after this call.
    sns.set(rc={'figure.figsize':(16.7,8.27)})
    fig, axs = plt.subplots(ncols=3)
    a = sns.barplot(x=list(week_dict.keys()),y=list(week_dict.values()),ax=axs[0])
    a.set(xlabel="Week", ylabel = "No. of Crimes")
    b = sns.barplot(x=list(month_dict.keys()),y=list(month_dict.values()),ax=axs[1])
    b.set(xlabel="Month", ylabel = "No. of Crimes")
    c = sns.barplot(x=list(year_dict.keys()),y=list(year_dict.values()),ax=axs[2])
    c.set(xlabel="Year", ylabel = "No. of Crimes")
    # The year-range labels are long; tilt them so they do not overlap.
    c.set_xticklabels(c.get_xticklabels(),rotation=30)
    plt.show()


In [0]:
def preprocessing(df,data_type='Train'):
    """Turn a raw incident table into a numeric feature table.

    Steps:
      1. drop the first column (the incident identifier);
      2. fill missing 'X_12' values with the column's most frequent value;
      3. split 'DATE' (text formatted as ``DD-MON-YY``, e.g. ``04-JUL-04``)
         into WEEK (0-3, week of the month), MONTH (0-11) and YEAR
         (two-digit years <= 20 map to 20YY, all others to 19YY);
      4. if a 'MULTIPLE_OFFENSE' column is present, plot its distribution
         with ``offense_analysis``;
      5. one-hot encode WEEK and MONTH.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with at least the columns 'DATE' and 'X_12'. It is NOT
        modified; a new frame is returned.
    data_type : str, default 'Train'
        Kept for backward compatibility. The function does not branch on
        it: whether the target plot is drawn depends only on the presence
        of the 'MULTIPLE_OFFENSE' column.

    Returns
    -------
    pandas.DataFrame
        The processed table. WEEK_0..WEEK_3 and MONTH_0..MONTH_11 are always
        all present, even when a value never occurs in ``df``, so that
        different inputs yield the same feature columns.

    Raises
    ------
    ValueError
        If a date has a non-numeric day/year or an unknown month
        abbreviation.
    """
    months_list = ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']
    month_lookup = {name: pos for pos, name in enumerate(months_list)}

    # drop() returns a new frame, so everything below leaves the caller's
    # DataFrame untouched.
    df = df.drop(columns=df.columns[0])

    # Plain assignment instead of chained `fillna(inplace=True)`: the chained
    # form is deprecated and silently does nothing under copy-on-write.
    x12_mode = df['X_12'].mode()
    if not x12_mode.empty:  # mode() is empty when every value is missing
        df['X_12'] = df['X_12'].fillna(x12_mode[0])

    # Vectorised slicing of the fixed-width 'DD-MON-YY' text.
    dates = df['DATE'].astype(str).str.upper()
    day = dates.str[:2].astype('int64')
    month = dates.str[3:6].map(month_lookup)
    if month.isna().any():
        bad = dates[month.isna()].iloc[0]
        raise ValueError("Unrecognised month in DATE value {!r}".format(bad))
    yy = dates.str[7:9].astype('int64')

    # Days 1-7 -> 0, 8-14 -> 1, 15-21 -> 2, 22 and later -> 3.
    week = ((day - 1) // 7).clip(lower=0, upper=3)
    year = np.where(yy <= 20, 2000 + yy, 1900 + yy)

    df.insert(0,'WEEK',week.astype('int64'))
    df.insert(1,'MONTH',month.astype('int64'))
    df.insert(2,'YEAR',year)
    df = df.drop(columns='DATE')

    # Only labelled data can be plotted against the target; unlabelled data
    # (no 'MULTIPLE_OFFENSE' column) skips the plot instead of failing.
    if 'MULTIPLE_OFFENSE' in df.columns:
        offense_analysis(df,list(df['MULTIPLE_OFFENSE']))

    # Fixed category lists make get_dummies emit a column for every possible
    # week/month, not only for the values that happen to occur in this frame.
    df['WEEK'] = pd.Categorical(df['WEEK'], categories=list(range(4)))
    df['MONTH'] = pd.Categorical(df['MONTH'], categories=list(range(12)))
    df = pd.get_dummies(df,columns=['WEEK','MONTH'])
    return df

In [0]:
# Read the raw training table and report its dimensions before any cleaning.
df = pd.read_csv('Train.csv')
print(df.shape)

# From here on `df` holds the engineered feature table, not the raw file.
df = preprocessing(df)

# Number of rows per target class, in the order [class 0, class 1].
target_col = df['MULTIPLE_OFFENSE']
class_values = [int((target_col == label).sum()) for label in (0, 1)]
sns.barplot(x=[0,1],y=class_values)
print(class_values)
df.head()

In [0]:
# Balance the two classes by undersampling the positive rows down to the
# number of negative rows.
neg_df = df.loc[df['MULTIPLE_OFFENSE'] == 0]
pos_all = df.loc[df['MULTIPLE_OFFENSE'] == 1]

# The sample size used to be the literal 1068 (presumably the size of the
# negative class -- confirm against the class counts printed earlier).
# Deriving it from the data keeps the classes balanced if the file changes,
# and min() prevents sample() from raising when positives are the smaller class.
n_pos = min(len(neg_df), len(pos_all))
pos_df = pos_all.sample(n=n_pos,random_state=2)
print(neg_df.shape,pos_df.shape)

# Shuffle with a fixed seed: without it the row order -- and therefore the
# train/test split made from it -- differs on every run.
normalized_df = pd.concat([neg_df, pos_df]).sample(frac=1,random_state=2).reset_index(drop=True)
Y = normalized_df['MULTIPLE_OFFENSE']
X = normalized_df.drop(columns=['MULTIPLE_OFFENSE'])
X.head()

In [0]:
# Hold out a quarter of the balanced rows for evaluation; the fixed
# random_state makes the partition repeatable for a given X/Y.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=2,
)
split_shapes = (x_train.shape, x_test.shape, y_train.shape, y_test.shape)
print(*split_shapes)

In [0]:
# Learn the per-feature min/max on the training rows only, then apply the
# same transform to both partitions so no test statistics leak into training.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [0]:
# Convert features and labels to float32 tensors (BCELoss needs float targets).
x_train = torch.tensor(x_train, dtype=torch.float32)
x_test = torch.tensor(x_test, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)
print(*(t.shape for t in (x_train, x_test, y_train, y_test)))

In [0]:
# Pair features with labels, then batch them. Only the training loader is
# shuffled; the evaluation loader keeps the stored row order.
train_data = TensorDataset(x_train, y_train)
test_data = TensorDataset(x_test, y_test)
loader_kwargs = {'batch_size': 32}
train_loader = DataLoader(train_data, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_data, **loader_kwargs)

In [0]:
# Sanity check: pull one mini-batch and show its first feature row and label.
data_iter = iter(train_loader)
first_batch = next(data_iter)
inputs, labels = first_batch
print(inputs[0], labels[0])

In [0]:
class Model(nn.Module):
    """Fully connected binary classifier.

    Architecture: in_features -> 128 -> 128 -> 64 -> 1, with ReLU and
    dropout (p=0.1) after each hidden layer and a sigmoid on the output,
    so ``forward`` returns probabilities in [0, 1] (suitable for
    ``nn.BCELoss``).

    Parameters
    ----------
    in_features : int, optional
        Number of input features. When omitted, the width of the
        module-level ``x_train`` is used, which is what the constructor
        always did before this argument existed.
    """

    def __init__(self, in_features=None):
        super(Model, self).__init__()
        if in_features is None:
            # Backward-compatible fallback: read the width from the global
            # training matrix. Pass in_features explicitly to avoid this
            # hidden dependency on notebook state.
            in_features = x_train.shape[1]
        self.hc1 = nn.Linear(in_features, 128)
        self.hc2 = nn.Linear(128, 128)
        self.hc3 = nn.Linear(128, 64)
        self.oc = nn.Linear(64, 1)
        # A single Dropout module is reused after every hidden layer.
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        """Map a (batch, in_features) tensor to (batch, 1) probabilities."""
        x = F.relu(self.hc1(x))
        x = self.dropout(x)
        x = F.relu(self.hc2(x))
        x = self.dropout(x)
        x = F.relu(self.hc3(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.oc(x))
        return x

In [0]:
# Instantiate the network together with its loss and optimizer.
# BCELoss expects probabilities, which Model.forward produces via sigmoid.
net = Model()
criterion = nn.BCELoss()
optimizer = optim.Adam(params=net.parameters())
net

In [0]:
def Graphing(loss):
    """Plot a sequence of loss values as a line chart and show it.

    Parameters
    ----------
    loss : sequence of float
        Loss values in the order they were recorded (in this notebook:
        one value per training epoch).

    The curve is drawn on its own new figure with labelled axes, so it can
    neither land on a figure left over from an earlier cell nor be mistaken
    for another plot when the notebook is skimmed.
    """
    fig, ax = plt.subplots()
    ax.plot(loss)
    ax.set(xlabel='Epoch', ylabel='Loss', title='Training loss')
    plt.show()

In [0]:
def eval(model,dataloader,checkpoint_path='checkpt.pt'):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    NOTE: the name shadows Python's built-in ``eval``; it is kept because
    other cells call it under this name.

    Parameters
    ----------
    model : torch.nn.Module
        Network whose forward pass returns one probability per sample.
    dataloader : iterable of (inputs, labels)
        Batches to score. Labels are expected to be 0/1.
    checkpoint_path : str or None, default 'checkpt.pt'
        State dict to load into ``model`` before scoring (the original
        behaviour). Pass ``None`` to score the weights ``model`` currently
        holds.

    Returns
    -------
    float
        Fraction of samples whose thresholded prediction (probability > 0.5)
        equals the label.

    Raises
    ------
    ValueError
        If ``dataloader`` yields no samples.

    Side effects
    ------------
    ``model`` is left in eval mode and, unless ``checkpoint_path`` is None,
    holds the checkpoint's weights afterwards -- calling this in the middle
    of training therefore replaces the weights being trained.
    """
    if checkpoint_path is not None:
        model.load_state_dict(torch.load(checkpoint_path))
    model.eval()

    correct = 0
    seen = 0
    # No gradients are needed for scoring; skipping them saves memory/time.
    with torch.no_grad():
        for inputs, labels in dataloader:
            # Score with the model that was passed in (not a global), and
            # flatten (batch, 1) outputs so they line up with 1-D labels.
            probs = model(inputs).reshape(-1)
            preds = (probs > 0.5).float()
            correct += int((preds == labels.reshape(-1).float()).sum().item())
            seen += labels.numel()

    if seen == 0:
        raise ValueError("eval() received a dataloader with no samples")
    # Sample-weighted accuracy: averaging per-batch accuracies would give a
    # smaller final batch the same weight as a full one.
    return correct / seen

In [0]:
# Train the network, checkpointing whenever the mean epoch loss improves.
num_epochs = 100
print_every = 10          # report accuracies every `print_every` epochs
loss = []                 # mean training loss of each epoch
min_loss = float('inf')

for epoch in tqdm_notebook(range(num_epochs)):
    # eval() leaves the network in eval mode, so re-enable dropout every epoch.
    net.train()
    running_loss = 0.0
    samples_seen = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        output = torch.flatten(net(inputs),start_dim=0)
        batch_loss = criterion(output,labels)
        batch_loss.backward()
        optimizer.step()
        # Weight by batch size so a smaller final batch does not skew the mean.
        running_loss += batch_loss.item() * labels.size(0)
        samples_seen += labels.size(0)

    # Use the epoch mean (not just the last mini-batch's loss) for both the
    # loss curve and the checkpoint decision; a single batch is too noisy.
    epoch_loss = running_loss / samples_seen
    loss.append(epoch_loss)
    if epoch_loss < min_loss:
        min_loss = epoch_loss
        print('New minimum detected. Saving parameters...')
        torch.save(net.state_dict(), 'checkpt.pt')

    if (epoch+1)%print_every == 0:
        # eval() loads 'checkpt.pt' into the network it is given. Snapshot the
        # live weights first and put them back afterwards, otherwise training
        # would silently restart from the best checkpoint after each report.
        # The accuracies printed here are therefore those of the checkpoint.
        live_state = {name: value.clone() for name, value in net.state_dict().items()}
        train_acc = eval(net,train_loader)
        val_acc = eval(net,test_loader)
        net.load_state_dict(live_state)
        # epoch is 0-based; print the 1-based number that the condition uses.
        print("Epoch: {} Train Accuracy: {} Validation Accuracy: {}".format(epoch+1,train_acc,val_acc))

Graphing(loss)
# Final score: eval() reloads the best checkpoint and leaves `net` holding it.
eval(net,test_loader)

In [0]:
def predict(data, model=None):
    """Return hard 0/1 predictions for ``data``.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Anything ``numpy.asarray`` can turn into a float matrix: nested
        lists, ndarray, CPU tensor, or a DataFrame (bool/int/float columns).
        It must already be scaled the same way as the training features.
    model : torch.nn.Module, optional
        Network to run. Defaults to the module-level ``net``.

    Returns
    -------
    torch.Tensor
        1-D float32 tensor of length n_samples holding 1.0 where the
        predicted probability is > 0.5 and 0.0 elsewhere.

    The forward pass runs in eval mode (dropout off) and without gradient
    tracking; the model's previous train/eval mode is restored on exit.
    """
    if model is None:
        model = net

    # An explicit float32 cast also handles frames with mixed bool and
    # numeric columns, which would otherwise become an object array that
    # torch.tensor rejects.
    features = torch.tensor(np.asarray(data, dtype=np.float32))

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            out = torch.flatten(model(features), start_dim=0)
    finally:
        model.train(was_training)

    out_labels = (out > 0.5).float()
    return out_labels

In [0]:
from sklearn.metrics import recall_score
# Recall of the trained network on the held-out split.
a = predict(x_test).detach().numpy()
# Score the predictions just computed (`a`); the previous code passed an
# undefined name `x` here.
recall_score(np.asarray(y_test),a)

In [0]:
# Download the unlabelled test table. `-nc` (no-clobber) skips the download
# when Test.csv already exists, so re-running this cell does not leave a
# stale Test.csv next to a fresh "Test.csv.1" that nothing reads.
!wget -nc https://raw.githubusercontent.com/arohanajit/novartis-ds-challenge/master/Test.csv

In [0]:
# Read the unlabelled test table. NOTE: `df` is rebound here and no longer
# refers to the training data.
df = pd.read_csv('Test.csv')
print(df.shape)

# Keep the incident identifiers aside; they are needed for the submission file.
ids = [incident for incident in df['INCIDENT_ID']]
df.head()

In [0]:
# preprocessing() returns ONE DataFrame, so it cannot be unpacked into X, Y.
test_features = preprocessing(df,data_type='Test')

# Keep features only, in exactly the column order the model was trained on
# (`X` is the training feature frame). One-hot columns that are absent from
# the test data are added as all-zero columns.
test_features = test_features.drop(columns=['MULTIPLE_OFFENSE'], errors='ignore')
test_features = test_features.reindex(columns=X.columns, fill_value=0)

# The network was trained on min-max scaled inputs; apply the scaler that was
# fitted on the training split before predicting.
a2 = list(predict(scaler.transform(test_features)).detach().numpy())

In [0]:
# Build the submission table: one row per incident with its predicted label.
# zip() would silently truncate to the shorter input, so check lengths first.
assert len(ids) == len(a2), "number of predictions does not match number of incident ids"
res = pd.DataFrame({
    'INCIDENT_ID': ids,
    # predictions arrive as 0.0/1.0 floats; write them as integer class labels
    'MULTIPLE_OFFENSE': [int(label) for label in a2],
})
# index=False: do not write the DataFrame's row index as an extra, unnamed
# first column of the CSV.
res.to_csv('submission.csv', index=False)