In [119]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import PorterStemmer
import pickle, re, spacy
# from sklearn.model_selection import train_test_split
# from sklearn.decomposition import PCA
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import RepeatedStratifiedKFold
# from sklearn.model_selection import cross_val_score

In [120]:
# load data
from datasets import load_dataset
dataset = load_dataset("pranjali97/Bias-detection-combined")
# train, test = train_test_split(dataset, test_size=0.2, random_state=42)

# 0 is liberal; 1 is conservative

# dataset["train"]["text"]

In [151]:
stemmer = PorterStemmer()
nlp = spacy.load("en_core_web_sm")

# Preprocessing functions
# add space before punctuations
def add_space_before(text):
    # regular expression to add space before punctuations
    processed_text = re.sub(r'([^\s\w])', r' \1', text)
    return processed_text

# remove gendered pronounds, names, stop words, and apply stemming
def removeUnnecessaryWords(text):
    doc = nlp(text)

    result = " ".join([
        "" if (
            token.text.lower() in STOP_WORDS
        ) else stemmer.stem(token.lemma_) for token in doc])

    return result.strip()

In [131]:
type(dataset["validation"]["text"][:38200])

list

In [138]:
for text in dataset["validation"]["text"][:6]:
    print(type(text), text)

<class 'str'> good aside from my family issues . my sister disappeared years ago . found out she's in a cult
<class 'str'>  If that was my missus horse he’d fuck right off and have us chasing him like bell ends
<class 'str'> that's nice , i love zebras , do you
<class 'str'>  We have nothing to lose in dumping rump asap. He is a menace.
<class 'str'> hi , read any good books lately ? i just finished hadean james latest .
<class 'str'>   OH FUCK-


Preprocessing

In [156]:
for text in dataset["train"]["text"][:38200]:
    text = removeUnnecessaryWords(text).lower()
    

In [157]:
for text in dataset["validation"]["text"][:38200]:
    text = removeUnnecessaryWords(text).lower()

In [155]:
for text in dataset["validation"]["text"][:6]:
    print(text)

good aside from my family issues . my sister disappeared years ago . found out she's in a cult
 If that was my missus horse he’d fuck right off and have us chasing him like bell ends
that's nice , i love zebras , do you
 We have nothing to lose in dumping rump asap. He is a menace.
hi , read any good books lately ? i just finished hadean james latest .
  OH FUCK-


Training

In [158]:
count_vectorizer = CountVectorizer()
train_counts = count_vectorizer.fit_transform(dataset["train"]["text"][:38200]) # 20000; use this form with Count Vectorizer
test_counts = count_vectorizer.transform(dataset["validation"]["text"][:38200]) # 20000; use this form with Count Vectorizer

tfidf_transformer = TfidfTransformer()
trainTexts = tfidf_transformer.fit_transform(train_counts)
testTexts = tfidf_transformer.fit_transform(test_counts)

X_train = pd.DataFrame(trainTexts.toarray(), columns=count_vectorizer.get_feature_names_out())
X_test = pd.DataFrame(testTexts.toarray(), columns=count_vectorizer.get_feature_names_out())
y_train = dataset["train"]["label"][:38200] # 20000
y_test = dataset["validation"]["label"][:38200] # 20000

# train model
# model = LinearSVC()
# model.fit(xTrain, yTrain)

# model = CalibratedClassifierCV(model, method='sigmoid', cv='prefit')
# model.fit(xTrain, yTrain)

# with open('./savedModels/randomForestPOLITICAL.pkl', 'wb') as model_file:
#     pickle.dump((model, vectorizer), model_file)

In [159]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [160]:
# cross_val_score(rf, X_test, y_test, cv=cv, scoring='accuracy', n_jobs=-1, error_score='raise')
# scores = evaluate_model(rf)
# print('Score: {:.4f}'.format(scores.mean()))
modelrf = rf.fit(X_train, y_train)
# accuracy
predrf = rf.predict(X_test)

In [161]:
modelSVC = LinearSVC()
modelSVC.fit(X_train, y_train)
predSVC = modelSVC.predict(X_test)

In [162]:
modelCC = CalibratedClassifierCV(modelSVC, method='sigmoid', cv='prefit')
# modelCC = CalibratedClassifierCV(modelSVC, method='isotonic', cv='prefit')
modelCC.fit(X_train, y_train)
predCC = modelCC.predict(X_test)

In [163]:
modelSGD = SGDClassifier()
modelSGD.fit(X_train, y_train)
predSGD = modelSGD.predict(X_test)

In [None]:
# steps_cgpt = [('pca', PCA(n_components=7)), ('m', LogisticRegression())]
# modelPip = Pipeline(steps=steps_cgpt)
# modelPip.fit(X_train, y_train)
# predPip = modelPip.predict(X_test)

# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import BaggingClassifier
# from sklearn.ensemble import VotingClassifier
# from sklearn.metrics import f1_score
# clf1 = DecisionTreeClassifier(max_features=1, random_state=0)
# clf2 = BaggingClassifier(max_features=4, random_state=0)
# clf3 = RandomForestClassifier(max_features=1, random_state=0)
# clf4 = AdaBoostClassifier(n_estimators=50, random_state=0)
# eclf1 = VotingClassifier(estimators=[('dt', clf1), ('bdt', clf2), ('rf', clf3), ('ab', clf4)], voting='hard')
# eclf1 = eclf1.fit(X_train, y_train)
# predEoE = eclf1.predict(X_test)

Best Model

In [175]:
with open('./savedModels/politicalCCModel.pkl', 'wb') as model_file:
    pickle.dump((modelCC, count_vectorizer), model_file)

In [165]:
from nltk.stem import PorterStemmer

explainer = LimeTextExplainer(class_names=["conservative", "liberal"])
stemmer = PorterStemmer()
text= ['''Liberals know what's up''',
            '''Abortion should be illegal.''',
            '''Trump for the win''',
            '''no more student debt yessss. Thanks biden''',
            '''even if Trump is old, biden is too old''',
            '''thank god roe v wade is gone''',
            '''MAGA. MAGA. MAGA''',
            '''biden has been doing well the past couple of years''']
yActual = [1, 0, 0, 1, 0, 0, 0, 1] # 0 is conservative (similar to female); 1 is liberal (similar to male)

vectorizedText = count_vectorizer.transform(text)
vectorizedTextTFIDF = tfidf_transformer.fit_transform(vectorizedText)
textsTransformed = pd.DataFrame(vectorizedTextTFIDF.toarray(), columns=count_vectorizer.get_feature_names_out())
predProb = modelCC.predict_proba(textsTransformed)
pred = modelCC.predict(textsTransformed)

explainer = LimeTextExplainer(class_names=['Conservative', 'Liberal'])
for i in range(len(pred)):
    print(i + 1, ":")
    print("Liberal: ", predProb[i][1])
    print("Conservative: ", predProb[i][0])
    print("Predicted: ", pred[i])
    if(yActual[i] == 0):
        print("Actual: Conservative")
    else:
        print("Actual: Liberal")
        
    predict_function = lambda x: modelCC.predict_proba(count_vectorizer.transform(x))
    explanation = explainer.explain_instance(text[i], predict_function, num_features=20)
    top_words_lime = explanation.as_list()
    print(f"Top words for text response {i + 1}:")
    liberalWords = []
    conservativeWords = []
    for word, score in top_words_lime:
        if score > 0:
            liberalWords.append((word, score))
        else:
            conservativeWords.append((word, score))
    print("Liberal words: ", liberalWords)
    print("Conservative words: ", conservativeWords)
    print("")

1 :
Liberal:  0.13368954267473965
Conservative:  0.8663104573252604
Predicted:  0
Actual: Liberal
Top words for text response 1:
Liberal words:  [('know', 0.1854243561655282), ('up', 0.03098906744438674), ('s', 0.006059703262995474), ('Liberals', 0.004493564608651654)]
Conservative words:  [('what', -0.17199968926395443)]

2 :
Liberal:  0.3643466746481276
Conservative:  0.6356533253518724
Predicted:  0
Actual: Conservative
Top words for text response 2:
Liberal words:  [('Abortion', 0.3791373590489367), ('illegal', 0.35106119523434176), ('be', 0.010595042370050321)]
Conservative words:  [('should', -0.28299832903403993)]

3 :
Liberal:  0.5011648332274894
Conservative:  0.4988351667725106
Predicted:  1
Actual: Conservative




Top words for text response 3:
Liberal words:  [('Trump', 0.8219342304617554), ('the', 0.06139993015345013)]
Conservative words:  [('win', -0.054367693548818), ('for', -0.05210166315301239)]

4 :
Liberal:  0.0477841465873955
Conservative:  0.9522158534126045
Predicted:  0
Actual: Liberal
Top words for text response 4:
Liberal words:  [('biden', 0.2183659204845987), ('more', 0.10962400886698383)]
Conservative words:  [('Thanks', -0.3180817903529079), ('yessss', -0.12238362024346494), ('student', -0.05902889542268618), ('debt', -0.0443234594639752), ('no', -0.009488640614990466)]

5 :
Liberal:  0.5368392551431437
Conservative:  0.4631607448568563
Predicted:  1
Actual: Conservative




Top words for text response 5:
Liberal words:  [('old', 0.5285500210233776), ('Trump', 0.28251847367784055), ('biden', 0.1662804753531387), ('even', 0.03698862093058713)]
Conservative words:  [('is', -0.36595258488895444), ('if', -0.060595745527053366), ('too', -0.012242947914665176)]

6 :
Liberal:  0.20644924072977303
Conservative:  0.793550759270227
Predicted:  0
Actual: Conservative
Top words for text response 6:
Liberal words:  [('wade', 0.43402056564649505), ('roe', 0.3603677521935746), ('v', 0.017695404114365237), ('gone', 0.0011454516474580304)]
Conservative words:  [('is', -0.4022060483879761), ('thank', -0.2184799470454067), ('god', -0.07644073396919132)]

7 :
Liberal:  0.0350351198338589
Conservative:  0.9649648801661411
Predicted:  0
Actual: Conservative
Top words for text response 7:
Liberal words:  []
Conservative words:  [('MAGA', -0.03714058190806652)]

8 :
Liberal:  0.7228462464568526
Conservative:  0.2771537535431474
Predicted:  1
Actual: Liberal
Top words for text res



In [166]:
# get model with vectorizer
# with open("savedModels/svmModel.pkl", "rb") as model_file:
with open("savedModels/politicalCCModel.pkl", "rb") as model_file:
    model = pickle.load(model_file)
vectorizedText = count_vectorizer.transform(text)
vectorizedTextTF = tfidf_transformer.fit_transform(vectorizedText)
textsTransformed = pd.DataFrame(
    vectorizedTextTF.toarray(), columns=count_vectorizer.get_feature_names_out()
)
# predict text bias probabilities
pred = model.predict_proba(textsTransformed)
# get most influential words
predict_function = lambda x: model.predict_proba(count_vectorizer.transform(x))

for i in range(len(pred)):
    explanation = explainer.explain_instance(
        text[i], predict_function, num_features=20
    )
    top_words_lime = explanation.as_list()
    liberalWords = []
    conservativeWords = []
    for word, score in top_words_lime:
        if score > 0:
            liberalWords.append((word, round(score, 3)))
        else:
            conservativeWords.append((word, round(score, 3)))

    # make the amount of liberal and conservative words equal
    liberalWords = liberalWords[:10]
    conservativeWords = conservativeWords[:10]
    if len(liberalWords) != len(conservativeWords):
        liberalWords = liberalWords[
            : min(len(liberalWords), len(conservativeWords))
        ]
        conservativeWords = conservativeWords[
            : min(len(liberalWords), len(conservativeWords))
        ]

    # get original words from stemmed words (map)
    liberalStemmedWords = [stemmer.stem(word) for word, score in liberalWords]
    conservativeStemmedWords = [
        stemmer.stem(word) for word, score in conservativeWords
    ]
    originalLiberalWords = []
    originalConservativeWords = []
    for stemmedWord in liberalStemmedWords:
        for token in text[i].split():
            if stemmer.stem(token) == stemmedWord:
                originalLiberalWords.append(token)
                break
    for stemmedWord in conservativeStemmedWords:
        for token in text[i].split():
            if stemmer.stem(token) == stemmedWord:
                originalConservativeWords.append(token)
                break

    print("Liberal Percentage: ", pred[0][1])
    print("Conservative Percentage: ", pred[0][0])
    print("Liberal Words: ", liberalWords)
    print("Conservative Words: ", conservativeWords)
    print("Original Liberal Words: ", originalLiberalWords)
    print("Original Conservative Words: ", originalConservativeWords)



Liberal Percentage:  0.13368954267473965
Conservative Percentage:  0.8663104573252604
Liberal Words:  [('know', 0.186)]
Conservative Words:  [('what', -0.171)]
Original Liberal Words:  ['know']
Original Conservative Words:  []
Liberal Percentage:  0.13368954267473965
Conservative Percentage:  0.8663104573252604
Liberal Words:  [('Abortion', 0.379)]
Conservative Words:  [('should', -0.282)]
Original Liberal Words:  ['Abortion']
Original Conservative Words:  ['should']
Liberal Percentage:  0.13368954267473965
Conservative Percentage:  0.8663104573252604
Liberal Words:  [('Trump', 0.822), ('the', 0.061)]
Conservative Words:  [('win', -0.053), ('for', -0.051)]
Original Liberal Words:  ['Trump', 'the']
Original Conservative Words:  ['win', 'for']




Liberal Percentage:  0.13368954267473965
Conservative Percentage:  0.8663104573252604
Liberal Words:  [('biden', 0.222), ('more', 0.106), ('no', 0.0)]
Conservative Words:  [('Thanks', -0.307), ('yessss', -0.118), ('student', -0.061)]
Original Liberal Words:  ['biden', 'more', 'no']
Original Conservative Words:  ['Thanks', 'student']
Liberal Percentage:  0.13368954267473965
Conservative Percentage:  0.8663104573252604
Liberal Words:  [('old', 0.516), ('Trump', 0.281), ('biden', 0.164)]
Conservative Words:  [('is', -0.373), ('if', -0.066), ('too', -0.01)]
Original Liberal Words:  ['old', 'Trump', 'biden']
Original Conservative Words:  ['is', 'if', 'too']
Liberal Percentage:  0.13368954267473965
Conservative Percentage:  0.8663104573252604
Liberal Words:  [('wade', 0.428), ('roe', 0.356), ('v', 0.026)]
Conservative Words:  [('is', -0.411), ('thank', -0.209), ('god', -0.078)]
Original Liberal Words:  ['wade', 'roe', 'v']
Original Conservative Words:  ['is', 'thank', 'god']
Liberal Percenta



In [167]:
# accuracyPip = accuracy_score(y_test, predPip)
# accuracyEoE = accuracy_score(y_test, predEoE)

cmrf = confusion_matrix(y_test, predrf)
cmSVC = confusion_matrix(y_test, predSVC)
cmCC = confusion_matrix(y_test, predCC)
cmSGD = confusion_matrix(y_test, predSGD)
# cmPip = confusion_matrix(y_test, predPip)
# cmEoE = confusion_matrix(y_test, predEoE)

print("Random Forest\nAccuracy: {:.2f} %".format(100*accuracy_score(y_test, predrf)))
print('F1-score: {:.3f}'.format(f1_score(y_test, predrf, average='macro')))
print("Confusion matrix:\n", cmrf)
print("\nSVC\nAccuracy: {:.2f} %".format(100*accuracy_score(y_test, predSVC)))
print('F1-score: {:.3f}'.format(f1_score(y_test, predSVC, average='macro')))
print("Confusion matrix:\n", cmSVC)
print("\nCalibrated Classifier\nAccuracy: {:.2f} %".format(100*accuracy_score(y_test, predCC)))
print('F1-score: {:.3f}'.format(f1_score(y_test, predCC, average='macro')))
print("Confusion matrix:\n", cmCC)
print("\nStochastic Gradient Descent\nAccuracy: {:.2f} %".format(100*accuracy_score(y_test, predSGD)))
print('F1-score: {:.3f}'.format(f1_score(y_test, predSGD, average='macro')))
print("Confusion matrix:\n", cmSGD)
# print("\nAccuracy Pipeline:", accuracyPip)
# print("Confusion matrix Pipeline:\n", cmPip)
# print("\nAccuracy Ensemble of Ensembles:", accuracyEoE)
# print("Confusion matrix Ensemble of Ensembles:\n", cmEoE)

Random Forest
Accuracy: 76.97 %
F1-score: 0.762
Confusion matrix:
 [[2007  314]
 [ 664 1261]]

SVC
Accuracy: 77.98 %
F1-score: 0.777
Confusion matrix:
 [[1911  410]
 [ 525 1400]]

Calibrated Classifier
Accuracy: 77.91 %
F1-score: 0.776
Confusion matrix:
 [[1891  430]
 [ 508 1417]]

Stochastic Gradient Descent
Accuracy: 76.92 %
F1-score: 0.761
Confusion matrix:
 [[2026  295]
 [ 685 1240]]


In [169]:
from numpy import mean
from numpy import std

X_samples= ['''Climate change is real, and we must take urgent action to protect our planet!''',
            '''🇺🇸 Our Constitution is timeless and should be upheld without compromise. #OriginalIntent''',
            #'''Trump for the winn''', (0)
            #'''Every voice matters! Let's ensure fair voting rights for all citizens. #DemocracyFirst''', from ChatGPT (1)
            '''The Second Amendment guarantees our right to bear arms. Let's defend it fiercely!''',
            '''Family values matter. Let's prioritize strong families and community bonds.''',
            '''MAGA. MAGA. MAGA''',
            #'''Love is love! Let's celebrate diversity and fight for LGBTQ+ rights.''', from ChatGPT (1)
            '''Traditional values are the bedrock of a stable society. 🇺🇸''',
            '''Equality and justice for all!''',
            '''biden can go another term''',
            '''🇺🇸 Small government, big freedom! Let's reduce bureaucracy and empower individual liberty''',
            '''Let's prioritize renewable energy sources and combat climate change together! #GreenNewDeal''',
            '''Voting is a fundamental right. Let's make it accessible, fair, and inclusive for everyone! 🇺🇸''',
            '''Abortion should be illegal.''',
            '''After the Capitol Riot, ain't no way Trump is runnin again..''',
            '''Two old guys. Rly. I'm givin this one to biden''']

y_cgpt = [1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1] # 0 is conservative (similar to female); 1 is liberal (similar to male)
#         |  |  |  |     |  |     |  |  |           (|: from ChatGPT)
cv_cgpt = count_vectorizer.transform(X_samples)
tfidf_cgpt = tfidf_transformer.transform(cv_cgpt)
pd_cgpt = pd.DataFrame(data = tfidf_cgpt.toarray())
X_cgpt = pd_cgpt.iloc[:,:].values

predrfGPT = modelrf.predict(X_cgpt)
predSVCGPT = modelSVC.predict(X_cgpt)
predCCGPT = modelCC.predict(X_cgpt)
predSGDGPT = modelSGD.predict(X_cgpt)

# report performance
print("\nAccuracy Random Forest: {:.3f}".format(accuracy_score(y_cgpt, predrfGPT)))
print("Confusion matrix Random Forest:\n", confusion_matrix(y_cgpt, predrfGPT))
print("\nAccuracy SVC: {:.3f}".format(accuracy_score(y_cgpt, predSVCGPT)))
print("Confusion matrix SVC:\n", confusion_matrix(y_cgpt, predSVCGPT))
print("\nAccuracy Calib. Classifier: {:.3f}".format(accuracy_score(y_cgpt, predCCGPT)))
print("Confusion matrix Calib. Classifier:\n", confusion_matrix(y_cgpt, predCCGPT))
print("\nAccuracy Stochastic Gradient Descent: {:.3f}".format(accuracy_score(y_cgpt, predSGDGPT)))
print("Confusion matrix Stochastic Gradient Descent:\n", confusion_matrix(y_cgpt, predSGDGPT))


Accuracy Random Forest: 0.500
Confusion matrix Random Forest:
 [[7 0]
 [7 0]]

Accuracy SVC: 0.571
Confusion matrix SVC:
 [[7 0]
 [6 1]]

Accuracy Calib. Classifier: 0.643
Confusion matrix Calib. Classifier:
 [[7 0]
 [5 2]]

Accuracy Stochastic Gradient Descent: 0.500
Confusion matrix Stochastic Gradient Descent:
 [[7 0]
 [7 0]]


