In [1]:
#Logistic Regression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score



In [2]:
df=pd.read_csv("pandas_label.csv")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194 entries, 0 to 193
Data columns (total 6 columns):
Unnamed: 0                     194 non-null int64
sentence                       194 non-null object
support_keyword                194 non-null int64
after_conclusion_keyword       194 non-null int64
previous_conclusion_keyword    194 non-null int64
support_sentence               194 non-null int64
dtypes: int64(5), object(1)
memory usage: 9.2+ KB
None


In [3]:
print(df["support_sentence"].value_counts())

0    138
1     56
Name: support_sentence, dtype: int64


In [4]:
# target/input split
y = df["support_sentence"]
X = df.drop(["Unnamed: 0", "sentence", "support_sentence"], axis=1)

In [5]:
# setting random state
rs = 10

X_mat = X.as_matrix()
x_train2, x_test2, y_train2, y_test2 = train_test_split(X_mat, y, test_size=0.2, stratify=y, random_state=rs)

smt=SMOTE(random_state=20);
X_train2, Y_train2=smt.fit_sample(x_train2,y_train2);

  after removing the cwd from sys.path.


In [6]:
print (pd.Series(Y_train2).value_counts())
print (pd.Series(y_test2).value_counts())

1    110
0    110
dtype: int64
0    28
1    11
Name: support_sentence, dtype: int64


In [7]:
from sklearn.preprocessing import StandardScaler

# initialise a standard scaler object
scaler = StandardScaler()

# visualise min, max, mean and standard dev of data before scaling
print("Before scaling\n-------------")
for i in range(3):
    col = X_train2[:,i]
    print("Variable #{}: min {}, max {}, mean {:.2f} and std dev {:.2f}".
          format(i, min(col), max(col), np.mean(col), np.std(col)))

# learn the mean and std.dev of variables from training data
# then use the learned values to transform training data
X_train2 = scaler.fit_transform(X_train2, Y_train2)

print("After scaling\n-------------")
for i in range(3):
    col = X_train2[:,i]
    print("Variable #{}: min {}, max {}, mean {:.2f} and std dev {:.2f}".
          format(i, min(col), max(col), np.mean(col), np.std(col)))

x_test2 = scaler.transform(x_test2)

Before scaling
-------------
Variable #0: min 0, max 1, mean 0.25 and std dev 0.44
Variable #1: min 0, max 2, mean 0.09 and std dev 0.30
Variable #2: min 0, max 2, mean 0.09 and std dev 0.30
After scaling
-------------
Variable #0: min -0.5843487097907777, max 1.711306935815849, mean 0.00 and std dev 1.00
Variable #1: min -0.3001501125938318, max 6.303152364470468, mean -0.00 and std dev 1.00
Variable #2: min -0.30015011259383223, max 6.303152364470477, mean -0.00 and std dev 1.00


In [8]:
from sklearn.linear_model import LogisticRegression

modelLR = LogisticRegression(random_state=rs)

#cv_average_score=cross_val_score(model, X_train, y_train, cv=10)
# fit it to training data

modelLR.fit(X_train2, Y_train2)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=10, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
# training and test accuracy
print("Train accuracy:", modelLR.score(X_train2, Y_train2))
print("Test accuracy:", modelLR.score(x_test2, y_test2))
#print("Crossvalidation accuracy",np.mean(cv_average_score))
# classification report on test data
y_pred2 = modelLR.predict(x_test2)
print(classification_report(y_test2, y_pred2))

Train accuracy: 0.7863636363636364
Test accuracy: 0.8205128205128205
              precision    recall  f1-score   support

           0       0.89      0.86      0.87        28
           1       0.67      0.73      0.70        11

    accuracy                           0.82        39
   macro avg       0.78      0.79      0.78        39
weighted avg       0.83      0.82      0.82        39



In [10]:
print(y_test2.values)
print(y_pred2)

[1 0 1 0 0 0 0 0 1 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0]
[1 0 1 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1
 1 0]


In [14]:
feature_names=X.columns
coef=modelLR.coef_[0]

indices = np.argsort(np.absolute(coef))
indices = np.flip(indices, axis=0)

indices=indices[:20]

for i in indices:
    print(feature_names[i], ":", coef[i])

support_keyword : 1.7943796812255421
after_conclusion_keyword : 0.566373130328095
previous_conclusion_keyword : 0.3007514307115342


In [15]:
# grid search
from sklearn.model_selection import GridSearchCV
params2 = {'C': [pow(10, x) for x in range(-6, 10)]}

# use all cores to tune logistic regression with C parameter
cv = GridSearchCV(param_grid=params2, estimator=LogisticRegression(random_state=rs), cv=10, n_jobs=-1)
cv.fit(X_train2, Y_train2)

# print parameters of the best model
print(cv.best_params_)

{'C': 1e-06}




In [13]:
# test the best model
print("Train accuracy:", cv.score(X_train2, Y_train2))
print("Test accuracy:", cv.score(x_test2, y_test2))

y_pred2 = cv.predict(x_test2)
print(classification_report(y_test2, y_pred2))

Train accuracy: 0.7772727272727272
Test accuracy: 0.8717948717948718
              precision    recall  f1-score   support

           0       0.90      0.93      0.91        28
           1       0.80      0.73      0.76        11

    accuracy                           0.87        39
   macro avg       0.85      0.83      0.84        39
weighted avg       0.87      0.87      0.87        39



In [24]:
print(modelLR.coef_)

[[1.47555353 1.03045801 0.33802345]]
