In [1]:
# Use the explicit %pip magic: it installs into the environment of the running
# kernel and does not rely on IPython's automagic resolving a bare `pip`.
%pip install lime

Note: you may need to restart the kernel to use updated packages.


## Import packages

In [2]:
from collections import OrderedDict

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from lime.lime_text import LimeTextExplainer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

## Load Training Data

In [7]:
# Read the training CSV from the working directory, report its (rows, columns)
# and preview the first five rows.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
print('train shape: ', train_df.shape)
train_df.head(n=5)

train shape:  (1306122, 3)


Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [12]:
# Count missing values per column (isnull is an alias of isna).
train_df.isnull().sum(axis=0)

qid              0
question_text    0
target           0
dtype: int64

In [13]:
# Drop every row that contains at least one missing value, then report the
# remaining (rows, columns).
train_df = train_df.dropna(axis=0, how='any')
print(train_df.shape)

(1306122, 3)


In [14]:
# train test split: hold out 10% of the rows as a validation set (fixed seed).
# NOTE: the result is bound back to `train_df`, so re-running this cell without
# reloading the data splits the already-reduced training set again.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=2018,
)

In [15]:
# Preview the first five validation rows (index labels still come from the original frame).
val_df.head(n=5)

Unnamed: 0,qid,question_text,target
1092525,d61b098340966d9d6501,What is the physical peak a human can achieve?,0
1024842,c8d5c445fce3c312ca5f,What is the society like in United Kingdom?,0
742735,9177e73a312fb5ff9e43,How should I deal with flight instructors who ...,0
163622,1ffecf3a38aa5062f51c,Can you take credit for subconscious acts? And...,0
1115967,daaea60735fcb5f49b12,What master programs are taught in English in ...,0


In [16]:
# Pick two validation questions by qid; the rows are stacked in the order listed here.
df_select = pd.concat(
    [
        val_df.loc[val_df['qid'].eq('d61b098340966d9d6501')],
        val_df.loc[val_df['qid'].eq('c8d5c445fce3c312ca5f')],
    ]
)

In [17]:
# Show the two selected validation rows (all columns).
df_select

Unnamed: 0,qid,question_text,target
1092525,d61b098340966d9d6501,What is the physical peak a human can achieve?,0
1024842,c8d5c445fce3c312ca5f,What is the society like in United Kingdom?,0


In [18]:
# Show only the question text of the selected rows.
df_select['question_text']

1092525    What is the physical peak a human can achieve?
1024842       What is the society like in United Kingdom?
Name: question_text, dtype: object

In [20]:
# Re-number the validation rows 0..n-1 so they can be addressed by position.
# Rebind the name instead of using inplace=True: the frame produced by the split
# is left untouched and the statement stays chainable.
# NOTE: df_select was sliced before this cell, so it keeps the original index labels.
val_df = val_df.reset_index(drop=True)
val_df

Unnamed: 0,qid,question_text,target
0,d61b098340966d9d6501,What is the physical peak a human can achieve?,0
1,c8d5c445fce3c312ca5f,What is the society like in United Kingdom?,0
2,9177e73a312fb5ff9e43,How should I deal with flight instructors who ...,0
3,1ffecf3a38aa5062f51c,Can you take credit for subconscious acts? And...,0
4,daaea60735fcb5f49b12,What master programs are taught in English in ...,0
...,...,...,...
130608,feb0053f32eda8483c9f,How do you define a great spectacle?,0
130609,be7afe3c3888bdaf322f,When will Quora stop so many utterly stupid qu...,1
130610,b0bf236fb7298811f4f6,When did commercial jets start using autopilot?,0
130611,4ded8f8f0a7d662bc43e,How many hate comments have you received?,0


In [21]:
# create a TF-IDF vectorizer and transform the training and validation data

# Word-level uni- and bi-grams, lower-cased, English stop words removed;
# terms seen in fewer than 10 documents are dropped and the vocabulary is
# capped at 100,000 features.
tfidf_vc = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    lowercase=True,
    stop_words='english',
    min_df=10,
    max_features=100000,
)

# The vocabulary and IDF weights are learned from the training questions only;
# the validation questions are projected onto that same vocabulary.
train_vc = tfidf_vc.fit_transform(train_df["question_text"])
val_vc = tfidf_vc.transform(val_df["question_text"])

In [22]:
# train a logistic reg model on training data

# solver='sag' shuffles the data while fitting, so random_state is fixed (same
# seed as the train/validation split) to make the fitted model, and every metric
# computed from it, repeatable across runs.
model = LogisticRegression(C=0.5, solver='sag', random_state=2018)
model.fit(train_vc, train_df["target"])  # fit() returns the estimator itself; no rebinding needed

# predict on validation data
val_pred = model.predict(val_vc)

In [23]:
#  evaluation metrics
# The sklearn.metrics functions are imported in the notebook's import cell so
# that all dependencies are declared in one place.
# precision / recall / F1 use scikit-learn's defaults (binary averaging, pos_label=1).

accuracy = accuracy_score(val_df["target"], val_pred)
precision = precision_score(val_df["target"], val_pred)
recall = recall_score(val_df["target"], val_pred)
f1 = f1_score(val_df["target"], val_pred)


In [24]:
# Print evaluation metrics, one "<label> <value>" line per metric
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)

Accuracy: 0.9504566926722455
Precision: 0.6948633364750235
Recall: 0.3629538461538461
F1 Score: 0.4768372544263886


In [25]:
# Display confusion matrix
# scikit-learn convention: rows are the true labels, columns the predicted labels.
conf_matrix = confusion_matrix(y_true=val_df["target"], y_pred=val_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[121193   1295]
 [  5176   2949]]


In [26]:
# Display classification report
# Readable names for the two target values; target_names are matched to the
# labels in sorted order, i.e. 0 -> "sincere", 1 -> "insincere".
class_names = ["sincere", "insincere"]
class_report = classification_report(
    y_true=val_df["target"],
    y_pred=val_pred,
    target_names=class_names,
)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

     sincere       0.96      0.99      0.97    122488
   insincere       0.69      0.36      0.48      8125

    accuracy                           0.95    130613
   macro avg       0.83      0.68      0.73    130613
weighted avg       0.94      0.95      0.94    130613

