In [1]:
from collections import Counter
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("../data/night/night_dataset.csv")
df_1 = df.drop('title', axis=1)
df_1 = df_1.drop('abstract', axis=1)
df = df_1.drop('claim', axis=1)

x = df.loc[:, df.columns!='valid_patent'].astype(str)
y = df['valid_patent'].astype(int)

## Train/Test Split

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=50)

## SMOTE

In [5]:
print('Original dataset shape {}'.format(Counter(y_train)))
sm = SMOTE(random_state=42)
x_res , y_res = sm.fit_resample(x_train, y_train)
print('Resampled dataset shape {}'.format(Counter(y_res)))
print('Y test dataset shpae {}'.format(Counter(y_test)))

Original dataset shape Counter({0: 773, 1: 92})
Resampled dataset shape Counter({0: 773, 1: 773})
Y test dataset shpae Counter({0: 336, 1: 35})


## Model

### Logistic Regression

In [6]:
from sklearn import linear_model, metrics
from sklearn.metrics import classification_report, confusion_matrix

In [7]:
logreg = linear_model.LogisticRegression(fit_intercept=False)
logreg.fit(x_res, y_res)

result = logreg.predict(x_test)

print("This classification report of using threshold: \n",classification_report(y_test, result))
print ("This is accuracy score:",metrics.accuracy_score(y_test, result),"\n")

This classification report of using threshold: 
               precision    recall  f1-score   support

           0       0.92      0.79      0.85       336
           1       0.14      0.31      0.19        35

    accuracy                           0.75       371
   macro avg       0.53      0.55      0.52       371
weighted avg       0.84      0.75      0.79       371

This is accuracy score: 0.7493261455525606 



In [8]:
prob = logreg.predict_proba(x_test)

y_pred = np.zeros(len(x_test))
y_pred[prob[:,1] > 0.35] = 1

print("This classification report of using threshold: \n",classification_report(y_test, y_pred))
print ("This is accuracy score:",metrics.accuracy_score(y_test, y_pred),"\n")

This classification report of using threshold: 
               precision    recall  f1-score   support

           0       0.95      0.24      0.38       336
           1       0.11      0.89      0.19        35

    accuracy                           0.30       371
   macro avg       0.53      0.56      0.29       371
weighted avg       0.87      0.30      0.36       371

This is accuracy score: 0.2991913746630728 



### SVM

In [9]:
from sklearn.linear_model import SGDClassifier
from sklearn import svm

svm = svm.LinearSVC()
svm.fit(x_res, y_res)

result = svm.predict(x_test)

print("This classification report of using threshold: \n",classification_report(y_test, result))
print ("This is accuracy score:",metrics.accuracy_score(y_test, result),"\n")



This classification report of using threshold: 
               precision    recall  f1-score   support

           0       0.96      0.26      0.40       336
           1       0.11      0.89      0.20        35

    accuracy                           0.32       371
   macro avg       0.53      0.57      0.30       371
weighted avg       0.88      0.32      0.38       371

This is accuracy score: 0.31536388140161725 



In [10]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

svc = CalibratedClassifierCV(base_estimator=LinearSVC(penalty='l2', dual=False), cv=5)
svc.fit(x_res, y_res)


result = svc.predict(x_test)

print("This classification report of using threshold: \n",classification_report(y_test, result))
print ("This is accuracy score:",metrics.accuracy_score(y_test, result),"\n")

This classification report of using threshold: 
               precision    recall  f1-score   support

           0       0.92      0.79      0.85       336
           1       0.14      0.31      0.19        35

    accuracy                           0.75       371
   macro avg       0.53      0.55      0.52       371
weighted avg       0.84      0.75      0.79       371

This is accuracy score: 0.7493261455525606 



In [11]:
prob = svc.predict_proba(x_test)

y_pred = np.zeros(len(x_test))
y_pred[prob[:,-1] > 0.75] = 1

print("This classification report of using threshold: \n",classification_report(y_test, y_pred))
print ("This is accuracy score:",metrics.accuracy_score(y_test, y_pred),"\n")

This classification report of using threshold: 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95       336
           1       0.00      0.00      0.00        35

    accuracy                           0.91       371
   macro avg       0.45      0.50      0.48       371
weighted avg       0.82      0.91      0.86       371

This is accuracy score: 0.9056603773584906 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree

In [12]:
from sklearn import tree

dt = tree.DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_leaf=5)
dt.fit(x_res, y_res)

result = dt.predict(x_test)

print("This classification report of using threshold: \n",classification_report(y_test, result))
print ("This is accuracy score:",metrics.accuracy_score(y_test, result),"\n")

This classification report of using threshold: 
               precision    recall  f1-score   support

           0       0.92      0.96      0.94       336
           1       0.32      0.20      0.25        35

    accuracy                           0.88       371
   macro avg       0.62      0.58      0.59       371
weighted avg       0.86      0.88      0.87       371

This is accuracy score: 0.8840970350404312 



In [13]:
prob = dt.predict_proba(x_test)

y_pred = np.zeros(len(x_test))
y_pred[prob[:,-1] > 0.75] = 1

print("This classification report of using threshold: \n",classification_report(y_test, y_pred))
print ("This is accuracy score:",metrics.accuracy_score(y_test, y_pred),"\n")

This classification report of using threshold: 
               precision    recall  f1-score   support

           0       0.92      0.96      0.94       336
           1       0.29      0.17      0.21        35

    accuracy                           0.88       371
   macro avg       0.60      0.56      0.58       371
weighted avg       0.86      0.88      0.87       371

This is accuracy score: 0.8814016172506739 



### Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=5,random_state=42)
rf.fit(x_res, y_res)

result = rf.predict(x_test)

print("This classification report of using threshold: \n",classification_report(y_test, result))
print ("This is accuracy score:",metrics.accuracy_score(y_test, result),"\n")

This classification report of using threshold: 
               precision    recall  f1-score   support

           0       0.92      0.96      0.94       336
           1       0.33      0.20      0.25        35

    accuracy                           0.89       371
   macro avg       0.63      0.58      0.59       371
weighted avg       0.86      0.89      0.87       371

This is accuracy score: 0.8867924528301887 



In [40]:
prob = rf.predict_proba(x_test)

y_pred = np.zeros(len(x_test))
y_pred[prob[:,-1] > 0.6] = 1

print("This classification report of using threshold: \n",classification_report(y_test, y_pred))
print ("This is accuracy score:",metrics.accuracy_score(y_test, y_pred),"\n")

This classification report of using threshold: 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95       336
           1       0.00      0.00      0.00        35

    accuracy                           0.91       371
   macro avg       0.45      0.50      0.48       371
weighted avg       0.82      0.91      0.86       371

This is accuracy score: 0.9056603773584906 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(learning_rate_init=0.01, activation='relu', solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(100, ), random_state=1)
mlp.fit(x_res, y_res)

result = mlp.predict(x_test)

print("This classification report of using threshold: \n",classification_report(y_test, result))
print ("This is accuracy score:",metrics.accuracy_score(y_test, result),"\n")

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


This classification report of using threshold: 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95       336
           1       0.00      0.00      0.00        35

    accuracy                           0.91       371
   macro avg       0.45      0.50      0.48       371
weighted avg       0.82      0.91      0.86       371

This is accuracy score: 0.9056603773584906 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
