In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from lime import lime_text
from lime.lime_text import LimeTextExplainer
import pickle
# from sklearn.model_selection import train_test_split
# from sklearn.decomposition import PCA
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import RepeatedStratifiedKFold
# from sklearn.model_selection import cross_val_score

In [21]:
# load data
from datasets import load_dataset
dataset = load_dataset("pranjali97/Bias-detection-combined")
# train, test = train_test_split(dataset, test_size=0.2, random_state=42)

# 0 is liberal; 1 is conservative

# dataset["train"]["text"]

In [22]:
vectorizer = CountVectorizer()
trainTexts = vectorizer.fit_transform(dataset["train"]["text"][:38200]) # 20000
testTexts = vectorizer.transform(dataset["validation"]["text"][:38200]) # 20000

X_train = pd.DataFrame(trainTexts.toarray(), columns=vectorizer.get_feature_names_out())
X_test = pd.DataFrame(testTexts.toarray(), columns=vectorizer.get_feature_names_out())
y_train = dataset["train"]["label"][:38200] # 20000
y_test = dataset["validation"]["label"][:38200] # 20000

# train model
# model = LinearSVC()
# model.fit(xTrain, yTrain)

# model = CalibratedClassifierCV(model, method='sigmoid', cv='prefit')
# model.fit(xTrain, yTrain)

# with open('./savedModels/randomForestPOLITICAL.pkl', 'wb') as model_file:
#     pickle.dump((model, vectorizer), model_file)

In [16]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [17]:
# cross_val_score(rf, X_test, y_test, cv=cv, scoring='accuracy', n_jobs=-1, error_score='raise')
# scores = evaluate_model(rf)
# print('Score: {:.4f}'.format(scores.mean()))
modelrf = rf.fit(X_train, y_train)
# accuracy
predrf = rf.predict(X_test)

In [23]:
modelSVC = LinearSVC()
modelSVC.fit(X_train, y_train)
predSVC = modelSVC.predict(X_test)



In [24]:
modelCC = CalibratedClassifierCV(modelSVC, method='sigmoid', cv='prefit')
# modelCC = CalibratedClassifierCV(modelSVC, method='isotonic', cv='prefit')
modelCC.fit(X_train, y_train)
predCC = modelCC.predict(X_test)

In [9]:
modelSGD = SGDClassifier()
modelSGD.fit(X_train, y_train)
predSGD = modelSGD.predict(X_test)

In [19]:
# steps_cgpt = [('pca', PCA(n_components=7)), ('m', LogisticRegression())]
# modelPip = Pipeline(steps=steps_cgpt)
# modelPip.fit(X_train, y_train)
# predPip = modelPip.predict(X_test)

# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import BaggingClassifier
# from sklearn.ensemble import VotingClassifier
# from sklearn.metrics import f1_score
# clf1 = DecisionTreeClassifier(max_features=1, random_state=0)
# clf2 = BaggingClassifier(max_features=4, random_state=0)
# clf3 = RandomForestClassifier(max_features=1, random_state=0)
# clf4 = AdaBoostClassifier(n_estimators=50, random_state=0)
# eclf1 = VotingClassifier(estimators=[('dt', clf1), ('bdt', clf2), ('rf', clf3), ('ab', clf4)], voting='hard')
# eclf1 = eclf1.fit(X_train, y_train)
# predEoE = eclf1.predict(X_test)

Best Model

In [30]:
with open('./savedModels/politicalCCModel.pkl', 'wb') as model_file:
    pickle.dump((modelCC, vectorizer), model_file)

In [35]:
from nltk.stem import PorterStemmer

explainer = LimeTextExplainer(class_names=["conservative", "liberal"])
stemmer = PorterStemmer()
text= ['''Liberals know what's up''',
            '''Abortion should be illegal.''',
            '''Trump for the win''',
            '''no more student debt yessss. Thanks biden''',
            '''even if Trump is old, biden is too old''',
            '''thank god roe v wade is gone''',
            '''MAGA. MAGA. MAGA''',
            '''biden has been doing well the past couple of years''']
yActual = [1, 0, 0, 1, 0, 0, 0, 1] # 0 is conservative (similar to female); 1 is liberal (similar to male)

vectorizedText = vectorizer.transform(text)
textsTransformed = pd.DataFrame(vectorizedText.toarray(), columns=vectorizer.get_feature_names_out())
predProb = modelCC.predict_proba(textsTransformed)
pred = modelCC.predict(textsTransformed)

explainer = LimeTextExplainer(class_names=['Conservative', 'Liberal'])
for i in range(len(pred)):
    print(i + 1, ":")
    print("Liberal: ", predProb[i][1])
    print("Conservative: ", predProb[i][0])
    print("Predicted: ", pred[i])
    if(yActual[i] == 0):
        print("Actual: Conservative")
    else:
        print("Actual: Liberal")
        
    vectorized_text = vectorizer.transform([text[i]])
    predict_function = lambda x: modelCC.predict_proba(vectorizer.transform(x))
    explanation = explainer.explain_instance(text[i], predict_function, num_features=20)
    top_words_lime = explanation.as_list()
    print(f"Top words for text response {i + 1}:")
    liberalWords = []
    conservativeWords = []
    for word, score in top_words_lime:
        if score > 0:
            liberalWords.append((word, score))
        else:
            conservativeWords.append((word, score))
    print("Liberal words: ", liberalWords)
    print("Conservative words: ", conservativeWords)
    print("")

1 :
Liberal:  0.2171458321821546
Conservative:  0.7828541678178453
Predicted:  0
Actual: Liberal
Top words for text response 1:
Liberal words:  [('know', 0.06178258365327984), ('Liberals', 0.04457119594102481), ('s', 0.0001855333609870643)]
Conservative words:  [('what', -0.028324825670516546), ('up', -0.015267124161473442)]

2 :
Liberal:  0.12405569905638592
Conservative:  0.8759443009436141
Predicted:  0
Actual: Conservative




Top words for text response 2:
Liberal words:  [('illegal', 0.026801596999851615), ('be', 0.005592652404028004)]
Conservative words:  [('should', -0.05076371417896263), ('Abortion', -0.011994565450412829)]

3 :
Liberal:  0.23577589584118494
Conservative:  0.764224104158815
Predicted:  0
Actual: Conservative
Top words for text response 3:
Liberal words:  [('Trump', 0.1325628201320244), ('the', 0.011983751155257379)]
Conservative words:  [('win', -0.036485855641067086), ('for', -0.02883393099368923)]

4 :
Liberal:  0.047350725522956
Conservative:  0.952649274477044
Predicted:  0
Actual: Liberal




Top words for text response 4:
Liberal words:  [('debt', 0.03814176216971023), ('biden', 0.030335492173116537), ('more', 0.016434506211342598), ('no', 0.0009073048822183146)]
Conservative words:  [('Thanks', -0.09750281630195293), ('yessss', -0.07626009899251267), ('student', -0.03361284136996062)]

5 :
Liberal:  0.6961870586877865
Conservative:  0.3038129413122135
Predicted:  1
Actual: Conservative
Top words for text response 5:
Liberal words:  [('old', 0.42851345706374877), ('Trump', 0.17523627640184852), ('biden', 0.07951313932187316), ('even', 0.0430581738172989)]
Conservative words:  [('is', -0.14337228564587537), ('if', -0.05851423185319642), ('too', -0.006446528749257277)]

6 :
Liberal:  0.5200623903842099
Conservative:  0.47993760961579013
Predicted:  1
Actual: Conservative




Top words for text response 6:
Liberal words:  [('roe', 0.42762928942585116), ('wade', 0.19716041658461897), ('v', 0.0008881359202391353)]
Conservative words:  [('thank', -0.14890361440256056), ('is', -0.07217663604826363), ('god', -0.05296700563745656), ('gone', -0.013371288807439362)]

7 :
Liberal:  0.11455882656509836
Conservative:  0.8854411734349017
Predicted:  0
Actual: Conservative
Top words for text response 7:
Liberal words:  []
Conservative words:  [('MAGA', -0.014966708224694886)]

8 :
Liberal:  0.24087670583109633
Conservative:  0.7591232941689037
Predicted:  0
Actual: Liberal
Top words for text response 8:
Liberal words:  [('years', 0.08685698561826047), ('been', 0.08348576568408557), ('biden', 0.06662044017306537), ('has', 0.04820235206103565), ('well', 0.029210379523304294), ('couple', 0.028692285610274988), ('of', 0.018747146029139298), ('the', 0.010179785037913105)]
Conservative words:  [('past', -0.20471104069301754), ('doing', -0.10682224141552635)]





In [39]:
# get model with vectorizer
# with open("savedModels/svmModel.pkl", "rb") as model_file:
with open("savedModels/politicalCCModel.pkl", "rb") as model_file:
    model, vectorizer = pickle.load(model_file)
vectorizedText = vectorizer.transform(text) # CHANGED HERE
textsTransformed = pd.DataFrame(
    vectorizedText.toarray(), columns=vectorizer.get_feature_names_out()
)
# predict text bias probabilities
pred = model.predict_proba(textsTransformed)
# get most influential words
predict_function = lambda x: model.predict_proba(vectorizer.transform(x))

for i in range(len(pred)):
    explanation = explainer.explain_instance(
        text[i], predict_function, num_features=20
    )
    top_words_lime = explanation.as_list()
    liberalWords = []
    conservativeWords = []
    for word, score in top_words_lime:
        if score > 0:
            liberalWords.append((word, round(score, 3)))
        else:
            conservativeWords.append((word, round(score, 3)))

    # make the amount of liberal and conservative words equal
    liberalWords = liberalWords[:10]
    conservativeWords = conservativeWords[:10]
    if len(liberalWords) != len(conservativeWords):
        liberalWords = liberalWords[
            : min(len(liberalWords), len(conservativeWords))
        ]
        conservativeWords = conservativeWords[
            : min(len(liberalWords), len(conservativeWords))
        ]

    # get original words from stemmed words (map)
    liberalStemmedWords = [stemmer.stem(word) for word, score in liberalWords]
    conservativeStemmedWords = [
        stemmer.stem(word) for word, score in conservativeWords
    ]
    originalLiberalWords = []
    originalConservativeWords = []
    for stemmedWord in liberalStemmedWords:
        for token in text[i].split():
            if stemmer.stem(token) == stemmedWord:
                originalLiberalWords.append(token)
                break
    for stemmedWord in conservativeStemmedWords:
        for token in text[i].split():
            if stemmer.stem(token) == stemmedWord:
                originalConservativeWords.append(token)
                break

    print("Liberal Percentage: ", pred[0][1])
    print("Conservative Percentage: ", pred[0][0])
    print("Liberal Words: ", liberalWords)
    print("Conservative Words: ", conservativeWords)
    print("Original Liberal Words: ", originalLiberalWords)
    print("Original Conservative Words: ", originalConservativeWords)



Liberal Percentage:  0 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('know', 0.061), ('Liberals', 0.044)]
Conservative Words:  [('what', -0.028), ('up', -0.015)]
Original Liberal Words:  ['know', 'Liberals']
Original Conservative Words:  ['up']
Liberal Percentage:  1 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('illegal', 0.027), ('be', 0.005)]
Conservative Words:  [('should', -0.051), ('Abortion', -0.012)]
Original Liberal Words:  ['be']
Original Conservative Words:  ['should', 'Abortion']




Liberal Percentage:  2 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('Trump', 0.132), ('the', 0.012)]
Conservative Words:  [('win', -0.037), ('for', -0.029)]
Original Liberal Words:  ['Trump', 'the']
Original Conservative Words:  ['win', 'for']
Liberal Percentage:  3 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('debt', 0.038), ('biden', 0.031), ('more', 0.016)]
Conservative Words:  [('Thanks', -0.097), ('yessss', -0.077), ('student', -0.033)]
Original Liberal Words:  ['debt', 'biden', 'more']
Original Conservative Words:  ['Thanks', 'student']




Liberal Percentage:  4 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('old', 0.429), ('Trump', 0.172), ('biden', 0.079)]
Conservative Words:  [('is', -0.142), ('if', -0.057), ('too', -0.007)]
Original Liberal Words:  ['old', 'Trump', 'biden']
Original Conservative Words:  ['is', 'if', 'too']
Liberal Percentage:  5 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('roe', 0.429), ('wade', 0.197), ('v', 0.0)]
Conservative Words:  [('thank', -0.15), ('is', -0.069), ('god', -0.056)]
Original Liberal Words:  ['roe', 'wade', 'v']
Original Conservative Words:  ['thank', 'is', 'god']
Liberal Percentage:  6 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  []
Conservative Words:  []
Original Liberal Words:  []
Original Conservative Words:  []
Liberal Percentage:  7 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('years', 0.086), ('been', 0.085)]
Conservative Words:  



In [11]:
accuracyrf = accuracy_score(y_test, predrf)
accuracySVC = accuracy_score(y_test, predSVC)
accuracyCC = accuracy_score(y_test, predCC)
accuracySGD = accuracy_score(y_test, predSGD)
# accuracyPip = accuracy_score(y_test, predPip)
# accuracyEoE = accuracy_score(y_test, predEoE)

cmrf = confusion_matrix(y_test, predrf)
cmSVC = confusion_matrix(y_test, predSVC)
cmCC = confusion_matrix(y_test, predCC)
cmSGD = confusion_matrix(y_test, predSGD)
# cmPip = confusion_matrix(y_test, predPip)
# cmEoE = confusion_matrix(y_test, predEoE)

print("Accuracy Random Forest:", accuracyrf)
print("Confusion matrix Random Forest:\n", cmrf)
print("\nAccuracy SVC:", accuracySVC)
print("Confusion matrix SVC:\n", cmSVC)
print("\nAccuracy Calib. Classifier:", accuracyCC)
print("Confusion matrix Calib. Classifier:\n", cmCC)
print("\nAccuracy Stochastic Gradient Descent:", accuracySGD)
print("Confusion matrix Stochastic Gradient Descent:\n", cmSGD)
# print("\nAccuracy Pipeline:", accuracyPip)
# print("Confusion matrix Pipeline:\n", cmPip)
# print("\nAccuracy Ensemble of Ensembles:", accuracyEoE)
# print("Confusion matrix Ensemble of Ensembles:\n", cmEoE)

Accuracy Random Forest: 0.7696655675930287
Confusion matrix Random Forest:
 [[2024  297]
 [ 681 1244]]

Accuracy SVC: 0.7618935468676401
Confusion matrix SVC:
 [[1835  486]
 [ 525 1400]]

Accuracy Calib. Classifier: 0.7616580310880829
Confusion matrix Calib. Classifier:
 [[1828  493]
 [ 519 1406]]

Accuracy Stochastic Gradient Descent: 0.7633066415449835
Confusion matrix Stochastic Gradient Descent:
 [[1908  413]
 [ 592 1333]]


In [12]:
from numpy import mean
from numpy import std

X_samples= ['''Liberals know what's up''',
            '''Abortion should be illegal.''',
            '''Trump for the win''',
            '''no more student debt yessss. Thanks biden''',
            '''even if Trump is old, biden is too old''',
            '''thank god roe v wade is gone''',
            '''MAGA. MAGA. MAGA''',
            '''biden hasn't been doing too bad the past couple of years''']

y_cgpt = [1, 0, 0, 1, 0, 0, 0, 1] # 0 is conservative (similar to female); 1 is liberal (similar to male)

cv_cgpt = vectorizer.transform(X_samples)
pd_cgpt = pd.DataFrame(data = cv_cgpt.toarray())
X_cgpt = pd_cgpt.iloc[:,:].values

predrfGPT = modelrf.predict(X_cgpt)
predCCGPT = modelCC.predict(X_cgpt)
predSGDGPT = modelSGD.predict(X_cgpt)

# report performance
print("\nAccuracy Random Forest:", accuracy_score(y_cgpt, predrfGPT))
print("Confusion matrix Random Forest (picks liberal all the time):\n", confusion_matrix(y_cgpt, predrfGPT))
print("\nAccuracy Calib. Classifier:", accuracy_score(y_cgpt, predCCGPT))
print("Confusion matrix Calib. Classifier:\n", confusion_matrix(y_cgpt, predCCGPT))
print("\nAccuracy Stochastic Gradient Descent:", accuracy_score(y_cgpt, predSGDGPT))
print("Confusion matrix Stochastic Gradient Descent:\n", confusion_matrix(y_cgpt, predSGDGPT))




Accuracy Random Forest: 0.375
Confusion matrix Random Forest (picks liberal all the time):
 [[3 0]
 [5 0]]

Accuracy Calib. Classifier: 0.625
Confusion matrix Calib. Classifier:
 [[3 0]
 [3 2]]

Accuracy Stochastic Gradient Descent: 0.5
Confusion matrix Stochastic Gradient Descent:
 [[3 0]
 [4 1]]


