In [1]:
from collections import Counter
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the merged dataset, then discard the free-text columns in two steps
# (`df_1` keeps the intermediate frame that still contains 'claim').
df = pd.read_csv("../../data/sea/dataset_all_merged.csv")
df_1 = df.drop(columns=['title', 'abstract'])
df = df_1.drop(columns=['claim'])

# Features are every remaining column except the label, cast to str;
# the label column is cast to int.
x = df.drop(columns=['valid_patent']).astype(str)
y = df['valid_patent'].astype(int)

## Train/Test Split

In [3]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=50,
)

## SMOTE

In [4]:
# Class balance of the training labels before oversampling.
print('Original dataset shape {}'.format(Counter(y_train)))

# Oversample only the training split; the test split is left untouched so the
# evaluation below still reflects the original class imbalance.
sm = SMOTE(random_state=42)

# Recent imbalanced-learn releases expose only `fit_resample`, while old ones
# expose only `fit_sample`; use whichever this installation provides so the
# cell runs on both.
resample = sm.fit_resample if hasattr(sm, 'fit_resample') else sm.fit_sample
x_res, y_res = resample(x_train, y_train)

# Class balance after oversampling, and of the held-out labels for comparison.
print('Resampled dataset shape {}'.format(Counter(y_res)))
print('Y test dataset shpae {}'.format(Counter(y_test)))

Original dataset shape Counter({0: 1311, 1: 22})
Resampled dataset shape Counter({0: 1311, 1: 1311})
Y test dataset shpae Counter({0: 565, 1: 7})


## Model

### Logistic Regression

In [5]:
from sklearn import linear_model, metrics
from sklearn.metrics import classification_report, confusion_matrix

In [6]:
# Logistic regression without an intercept term, fitted on the resampled
# training data (`fit` returns the estimator itself).
logreg = linear_model.LogisticRegression(fit_intercept=False).fit(x_res, y_res)

# Per-class probabilities for the held-out rows.
prob = logreg.predict_proba(x_test)

# Predict 1 only where the second probability column exceeds 0.65, else 0.
y_pred = np.where(prob[:, 1] > 0.65, 1.0, 0.0)

In [7]:
# Per-class precision / recall / F1 for the thresholded predictions in `y_pred`.
report = classification_report(y_test, y_pred)
print("This classification report of using threshold: \n", report)

# Overall accuracy of the same predictions.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("This is accuracy score:", accuracy, "\n")

This classification report of using threshold: 
              precision    recall  f1-score   support

          0       0.99      0.89      0.94       565
          1       0.02      0.14      0.03         7

avg / total       0.98      0.88      0.92       572

This is accuracy score: 0.8793706293706294 



### SVM

In [8]:
from sklearn.linear_model import SGDClassifier
from sklearn import svm

# Linear SVM trained on the resampled data. The estimator gets its own name:
# binding it to `svm` would replace the imported `sklearn.svm` module, so
# running this cell a second time would fail on `svm.LinearSVC`.
svm_clf = svm.LinearSVC()
svm_clf.fit(x_res, y_res)

# Hard class predictions for the held-out rows (no probability threshold here).
result = svm_clf.predict(x_test)

print("This classification report of using threshold: \n",classification_report(y_test, result))
print ("This is accuracy score:",metrics.accuracy_score(y_test, result),"\n")

This classification report of using threshold: 
              precision    recall  f1-score   support

          0       1.00      0.00      0.01       565
          1       0.01      1.00      0.02         7

avg / total       0.99      0.02      0.01       572

This is accuracy score: 0.015734265734265736 



In [9]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# Wrap a linear SVM in 5-fold probability calibration so that `predict_proba`
# becomes available. The wrapped estimator is passed positionally because newer
# scikit-learn releases renamed the `base_estimator` keyword to `estimator`;
# the positional form is accepted under either name.
svc = CalibratedClassifierCV(LinearSVC(penalty='l2', dual=False), cv=5)
svc.fit(x_res, y_res)

# Hard class predictions for the held-out rows.
result = svc.predict(x_test)

print("This classification report of using threshold: \n",classification_report(y_test, result))
print ("This is accuracy score:",metrics.accuracy_score(y_test, result),"\n")

This classification report of using threshold: 
              precision    recall  f1-score   support

          0       0.99      0.85      0.91       565
          1       0.01      0.14      0.02         7

avg / total       0.98      0.84      0.90       572

This is accuracy score: 0.8426573426573427 



In [10]:
# Calibrated per-class probabilities for the held-out rows.
prob = svc.predict_proba(x_test)

# Predict 1 only where the last probability column exceeds 0.75, else 0.
y_pred = np.where(prob[:, -1] > 0.75, 1.0, 0.0)

report = classification_report(y_test, y_pred)
print("This classification report of using threshold: \n", report)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("This is accuracy score:", accuracy, "\n")

This classification report of using threshold: 
              precision    recall  f1-score   support

          0       0.99      0.89      0.94       565
          1       0.02      0.14      0.03         7

avg / total       0.98      0.88      0.93       572

This is accuracy score: 0.8828671328671329 



### Decision Tree

In [11]:
from sklearn import tree

# Shallow Gini decision tree (depth <= 5, at least 5 samples per leaf),
# fitted on the resampled training data.
dt = tree.DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=5,
).fit(x_res, y_res)

# Hard class predictions for the held-out rows.
result = dt.predict(x_test)

report = classification_report(y_test, result)
print("This classification report of using threshold: \n", report)
accuracy = metrics.accuracy_score(y_test, result)
print("This is accuracy score:", accuracy, "\n")

This classification report of using threshold: 
              precision    recall  f1-score   support

          0       0.99      0.99      0.99       565
          1       0.36      0.57      0.44         7

avg / total       0.99      0.98      0.98       572

This is accuracy score: 0.9825174825174825 



In [12]:
# Per-class probabilities from the decision tree for the held-out rows.
prob = dt.predict_proba(x_test)

# Predict 1 only where the last probability column exceeds 0.75, else 0.
y_pred = np.where(prob[:, -1] > 0.75, 1.0, 0.0)

report = classification_report(y_test, y_pred)
print("This classification report of using threshold: \n", report)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("This is accuracy score:", accuracy, "\n")

This classification report of using threshold: 
              precision    recall  f1-score   support

          0       0.99      0.99      0.99       565
          1       0.44      0.57      0.50         7

avg / total       0.99      0.99      0.99       572

This is accuracy score: 0.986013986013986 



### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest (5 trees) fitted on the resampled training data.
# The seed is fixed so the forest, and therefore the metrics printed below,
# are the same on every run; without it each run builds different trees.
rf = RandomForestClassifier(n_estimators=5, random_state=42)
rf.fit(x_res, y_res)

# Hard class predictions for the held-out rows.
result = rf.predict(x_test)

print("This classification report of using threshold: \n",classification_report(y_test, result))
print ("This is accuracy score:",metrics.accuracy_score(y_test, result),"\n")

This classification report of using threshold: 
              precision    recall  f1-score   support

          0       1.00      1.00      1.00       565
          1       0.75      0.86      0.80         7

avg / total       1.00      0.99      0.99       572

This is accuracy score: 0.9947552447552448 



In [14]:
# Per-class probabilities from the random forest for the held-out rows.
prob = rf.predict_proba(x_test)

# Predict 1 only where the last probability column exceeds 0.8, else 0.
y_pred = np.where(prob[:, -1] > 0.8, 1.0, 0.0)

report = classification_report(y_test, y_pred)
print("This classification report of using threshold: \n", report)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("This is accuracy score:", accuracy, "\n")

This classification report of using threshold: 
              precision    recall  f1-score   support

          0       0.99      1.00      1.00       565
          1       1.00      0.29      0.44         7

avg / total       0.99      0.99      0.99       572

This is accuracy score: 0.9912587412587412 



### MLP

In [15]:
from sklearn.neural_network import MLPClassifier

# Single hidden layer of 100 ReLU units trained with L-BFGS, fitted on the
# resampled training data.
# NOTE(review): scikit-learn documents `learning_rate_init` as used only by the
# 'sgd' and 'adam' solvers, so it presumably has no effect here - confirm.
mlp = MLPClassifier(
    learning_rate_init=0.01,
    activation='relu',
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(100, ),
    random_state=1,
).fit(x_res, y_res)

# Hard class predictions for the held-out rows.
result = mlp.predict(x_test)

report = classification_report(y_test, result)
print("This classification report of using threshold: \n", report)
accuracy = metrics.accuracy_score(y_test, result)
print("This is accuracy score:", accuracy, "\n")

This classification report of using threshold: 
              precision    recall  f1-score   support

          0       0.99      1.00      0.99       565
          1       0.00      0.00      0.00         7

avg / total       0.98      0.99      0.98       572

This is accuracy score: 0.9877622377622378 



  'precision', 'predicted', average, warn_for)
