In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from lime import lime_text
from lime.lime_text import LimeTextExplainer
import pickle, re
from flask import Flask, request, jsonify
from nltk.stem import PorterStemmer
from spacy.lang.en.stop_words import STOP_WORDS
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors

# from sklearn.model_selection import train_test_split
# from sklearn.decomposition import PCA
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import RepeatedStratifiedKFold
# from sklearn.model_selection import cross_val_score

In [13]:
# preprocessing functions
# # Preprocessing functions
# add space before punctuations

app = Flask(__name__)
explainer = LimeTextExplainer(class_names=['female', 'male'])
stemmer = PorterStemmer()
def add_space_before(text):
    # regular expression to add space before punctuations
    processed_text = re.sub(r'([^\s\w])', r' \1', text)
    return processed_text

# remove gendered pronounds, names, stop words, and apply stemming
def removeUnnecessaryWords(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    result = " ".join([
        "" if (
            token.pos_ == "PRON" and token.lemma_ not in ["I", "you"]
        ) or (
            token.ent_type_ == "PERSON" or token.text.lower() in ["woman", "women", "man", "men", "he", "she", "him", "her"]
        ) or (
            token.text.lower() in STOP_WORDS
        ) else stemmer.stem(token.lemma_) for token in doc])

    return result.strip()

In [8]:
from datasets import load_dataset

# load data
dataset = load_dataset("pranjali97/Bias-detection-combined")
# train, test = train_test_split(dataset, test_size=0.2, random_state=42)

# 0 is liberal; 1 is conservative

# dataset["train"]["text"]

In [20]:
vectorizer = CountVectorizer()

# preprocess training and testing data
train = [removeUnnecessaryWords(add_space_before(text)) for text in dataset["train"]["text"][:1000]]
validation = [removeUnnecessaryWords(add_space_before(text)) for text in dataset["validation"]["text"][:1000]]

trainTexts = vectorizer.fit_transform(train) # 20000
testTexts = vectorizer.transform(validation) # 20000

X_train = pd.DataFrame(trainTexts.toarray(), columns=vectorizer.get_feature_names_out())
X_test = pd.DataFrame(testTexts.toarray(), columns=vectorizer.get_feature_names_out())
y_train = dataset["train"]["label"][:1000] # 20000
y_test = dataset["validation"]["label"][:1000] # 20000

# train model
# model = LinearSVC()
# model.fit(xTrain, yTrain)

# model = CalibratedClassifierCV(model, method='sigmoid', cv='prefit')
# model.fit(xTrain, yTrain)

# with open('./savedModels/randomForestPOLITICAL.pkl', 'wb') as model_file:
#     pickle.dump((model, vectorizer), model_file)

In [21]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [22]:
# cross_val_score(rf, X_test, y_test, cv=cv, scoring='accuracy', n_jobs=-1, error_score='raise')
# scores = evaluate_model(rf)
# print('Score: {:.4f}'.format(scores.mean()))
modelrf = rf.fit(X_train, y_train)
# accuracy
predrf = rf.predict(X_test)

In [23]:
modelSVC = LinearSVC()
modelSVC.fit(X_train, y_train)
predSVC = modelSVC.predict(X_test)



In [24]:
modelCC = CalibratedClassifierCV(modelSVC, method='sigmoid', cv='prefit')
# modelCC = CalibratedClassifierCV(modelSVC, method='isotonic', cv='prefit')
modelCC.fit(X_train, y_train)
predCC = modelCC.predict(X_test)

In [25]:
modelSGD = SGDClassifier()
modelSGD.fit(X_train, y_train)
predSGD = modelSGD.predict(X_test)

In [None]:
# steps_cgpt = [('pca', PCA(n_components=7)), ('m', LogisticRegression())]
# modelPip = Pipeline(steps=steps_cgpt)
# modelPip.fit(X_train, y_train)
# predPip = modelPip.predict(X_test)

# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import BaggingClassifier
# from sklearn.ensemble import VotingClassifier
# from sklearn.metrics import f1_score
# clf1 = DecisionTreeClassifier(max_features=1, random_state=0)
# clf2 = BaggingClassifier(max_features=4, random_state=0)
# clf3 = RandomForestClassifier(max_features=1, random_state=0)
# clf4 = AdaBoostClassifier(n_estimators=50, random_state=0)
# eclf1 = VotingClassifier(estimators=[('dt', clf1), ('bdt', clf2), ('rf', clf3), ('ab', clf4)], voting='hard')
# eclf1 = eclf1.fit(X_train, y_train)
# predEoE = eclf1.predict(X_test)

Best Model

In [27]:
with open('../savedModels/politicalCCModel.pkl', 'wb') as model_file:
    pickle.dump((modelCC, vectorizer), model_file)

In [28]:
from nltk.stem import PorterStemmer

explainer = LimeTextExplainer(class_names=["conservative", "liberal"])
stemmer = PorterStemmer()
text= ['''Liberals know what's up''',
            '''Abortion should be illegal.''',
            '''Trump for the win''',
            '''no more student debt yessss. Thanks biden''',
            '''even if Trump is old, biden is too old''',
            '''thank god roe v wade is gone''',
            '''MAGA. MAGA. MAGA''',
            '''biden has been doing well the past couple of years''']
yActual = [1, 0, 0, 1, 0, 0, 0, 1] # 0 is conservative (similar to female); 1 is liberal (similar to male)

vectorizedText = vectorizer.transform(text)
textsTransformed = pd.DataFrame(vectorizedText.toarray(), columns=vectorizer.get_feature_names_out())
predProb = modelCC.predict_proba(textsTransformed)
pred = modelCC.predict(textsTransformed)

explainer = LimeTextExplainer(class_names=['Conservative', 'Liberal'])
for i in range(len(pred)):
    print(i + 1, ":")
    print("Liberal: ", predProb[i][1])
    print("Conservative: ", predProb[i][0])
    print("Predicted: ", pred[i])
    if(yActual[i] == 0):
        print("Actual: Conservative")
    else:
        print("Actual: Liberal")
        
    vectorized_text = vectorizer.transform([text[i]])
    predict_function = lambda x: modelCC.predict_proba(vectorizer.transform(x))
    explanation = explainer.explain_instance(text[i], predict_function, num_features=20)
    top_words_lime = explanation.as_list()
    print(f"Top words for text response {i + 1}:")
    liberalWords = []
    conservativeWords = []
    for word, score in top_words_lime:
        if score > 0:
            liberalWords.append((word, score))
        else:
            conservativeWords.append((word, score))
    print("Liberal words: ", liberalWords)
    print("Conservative words: ", conservativeWords)
    print("")

1 :
Liberal:  0.003097892889321571
Conservative:  0.9969021071106784
Predicted:  0
Actual: Liberal
Top words for text response 1:
Liberal words:  []
Conservative words:  [('know', -0.023476484574767774), ('up', -0.0020203011199220714), ('Liberals', -0.00018541017251410736), ('s', -0.00017224570386588316), ('what', -0.00014488239923638515)]

2 :
Liberal:  0.029904043787915835
Conservative:  0.9700959562120841
Predicted:  0
Actual: Conservative
Top words for text response 2:
Liberal words:  []
Conservative words:  [('Abortion', 0.0), ('should', 0.0), ('be', 0.0), ('illegal', 0.0)]

3 :
Liberal:  0.9998989016119424
Conservative:  0.00010109838805760774
Predicted:  1
Actual: Conservative




Top words for text response 3:
Liberal words:  [('Trump', 0.3981783789573515), ('win', 0.331170643296357), ('for', 0.13118368558689628), ('the', 0.08024210116376682)]
Conservative words:  []

4 :
Liberal:  0.021231170229045995
Conservative:  0.978768829770954
Predicted:  0
Actual: Liberal
Top words for text response 4:
Liberal words:  [('student', 0.008728634593342513), ('no', 0.00020832478472099233), ('yessss', 0.00020676948944734372), ('Thanks', 0.00015807950810481003), ('more', 7.539705217440952e-05), ('biden', 7.438009320146678e-05)]
Conservative words:  [('debt', -0.019359631668459883)]

5 :
Liberal:  0.9998705226849852
Conservative:  0.000129477315014781
Predicted:  1
Actual: Conservative




Top words for text response 5:
Liberal words:  [('old', 0.36709725451584335), ('Trump', 0.3451253179613628), ('even', 0.024974084346353266), ('too', 0.0205572294213355), ('is', 0.016919477438620573), ('biden', 0.012896931458934481)]
Conservative words:  [('if', -0.004952965985852626)]

6 :
Liberal:  0.0009643868324754551
Conservative:  0.9990356131675245
Predicted:  0
Actual: Conservative
Top words for text response 6:
Liberal words:  []
Conservative words:  [('thank', -0.0120110943177875), ('god', -0.010576489500987928), ('wade', -0.0006819497129112941), ('roe', -0.0005931784918213847), ('v', -0.0004809942553496519), ('is', -0.0004569926878966187), ('gone', -0.0004180907241746933)]

7 :
Liberal:  0.10155437430905313
Conservative:  0.8984456256909469
Predicted:  0
Actual: Conservative




Top words for text response 7:
Liberal words:  [('MAGA', 0.027596976786025882)]
Conservative words:  []

8 :
Liberal:  0.3576212305109313
Conservative:  0.6423787694890687
Predicted:  0
Actual: Liberal
Top words for text response 8:
Liberal words:  [('well', 0.32702712643405596), ('biden', 4.540157586642417e-05), ('couple', 3.829385738588693e-05), ('past', 3.352589848464036e-05), ('the', 3.122924889509688e-05), ('has', 3.116832029671241e-05), ('doing', 3.1118722187138686e-05), ('years', 2.5156443026418673e-05), ('been', 2.3324556708121878e-05), ('of', 2.1914998182988445e-05)]
Conservative words:  []





In [None]:
# get model with vectorizer
# with open("savedModels/svmModel.pkl", "rb") as model_file:
with open("savedModels/politicalCCModel.pkl", "rb") as model_file:
    model, vectorizer = pickle.load(model_file)
vectorizedText = vectorizer.transform(text) # CHANGED HERE
textsTransformed = pd.DataFrame(
    vectorizedText.toarray(), columns=vectorizer.get_feature_names_out()
)
# predict text bias probabilities
pred = model.predict_proba(textsTransformed)
# get most influential words
predict_function = lambda x: model.predict_proba(vectorizer.transform(x))

for i in range(len(pred)):
    explanation = explainer.explain_instance(
        text[i], predict_function, num_features=20
    )
    top_words_lime = explanation.as_list()
    liberalWords = []
    conservativeWords = []
    for word, score in top_words_lime:
        if score > 0:
            liberalWords.append((word, round(score, 3)))
        else:
            conservativeWords.append((word, round(score, 3)))

    # make the amount of liberal and conservative words equal
    liberalWords = liberalWords[:10]
    conservativeWords = conservativeWords[:10]
    if len(liberalWords) != len(conservativeWords):
        liberalWords = liberalWords[
            : min(len(liberalWords), len(conservativeWords))
        ]
        conservativeWords = conservativeWords[
            : min(len(liberalWords), len(conservativeWords))
        ]

    # get original words from stemmed words (map)
    liberalStemmedWords = [stemmer.stem(word) for word, score in liberalWords]
    conservativeStemmedWords = [
        stemmer.stem(word) for word, score in conservativeWords
    ]
    originalLiberalWords = []
    originalConservativeWords = []
    for stemmedWord in liberalStemmedWords:
        for token in text[i].split():
            if stemmer.stem(token) == stemmedWord:
                originalLiberalWords.append(token)
                break
    for stemmedWord in conservativeStemmedWords:
        for token in text[i].split():
            if stemmer.stem(token) == stemmedWord:
                originalConservativeWords.append(token)
                break

    print("Liberal Percentage: ", pred[0][1])
    print("Conservative Percentage: ", pred[0][0])
    print("Liberal Words: ", liberalWords)
    print("Conservative Words: ", conservativeWords)
    print("Original Liberal Words: ", originalLiberalWords)
    print("Original Conservative Words: ", originalConservativeWords)



Liberal Percentage:  0 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('know', 0.061), ('Liberals', 0.044)]
Conservative Words:  [('what', -0.028), ('up', -0.015)]
Original Liberal Words:  ['know', 'Liberals']
Original Conservative Words:  ['up']
Liberal Percentage:  1 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('illegal', 0.027), ('be', 0.005)]
Conservative Words:  [('should', -0.051), ('Abortion', -0.012)]
Original Liberal Words:  ['be']
Original Conservative Words:  ['should', 'Abortion']




Liberal Percentage:  2 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('Trump', 0.132), ('the', 0.012)]
Conservative Words:  [('win', -0.037), ('for', -0.029)]
Original Liberal Words:  ['Trump', 'the']
Original Conservative Words:  ['win', 'for']
Liberal Percentage:  3 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('debt', 0.038), ('biden', 0.031), ('more', 0.016)]
Conservative Words:  [('Thanks', -0.097), ('yessss', -0.077), ('student', -0.033)]
Original Liberal Words:  ['debt', 'biden', 'more']
Original Conservative Words:  ['Thanks', 'student']




Liberal Percentage:  4 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('old', 0.429), ('Trump', 0.172), ('biden', 0.079)]
Conservative Words:  [('is', -0.142), ('if', -0.057), ('too', -0.007)]
Original Liberal Words:  ['old', 'Trump', 'biden']
Original Conservative Words:  ['is', 'if', 'too']
Liberal Percentage:  5 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('roe', 0.429), ('wade', 0.197), ('v', 0.0)]
Conservative Words:  [('thank', -0.15), ('is', -0.069), ('god', -0.056)]
Original Liberal Words:  ['roe', 'wade', 'v']
Original Conservative Words:  ['thank', 'is', 'god']
Liberal Percentage:  6 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  []
Conservative Words:  []
Original Liberal Words:  []
Original Conservative Words:  []
Liberal Percentage:  7 0.2171458321821546
Conservative Percentage:  0.7828541678178453
Liberal Words:  [('years', 0.086), ('been', 0.085)]
Conservative Words:  



In [None]:
accuracyrf = accuracy_score(y_test, predrf)
accuracySVC = accuracy_score(y_test, predSVC)
accuracyCC = accuracy_score(y_test, predCC)
accuracySGD = accuracy_score(y_test, predSGD)
# accuracyPip = accuracy_score(y_test, predPip)
# accuracyEoE = accuracy_score(y_test, predEoE)

cmrf = confusion_matrix(y_test, predrf)
cmSVC = confusion_matrix(y_test, predSVC)
cmCC = confusion_matrix(y_test, predCC)
cmSGD = confusion_matrix(y_test, predSGD)
# cmPip = confusion_matrix(y_test, predPip)
# cmEoE = confusion_matrix(y_test, predEoE)

print("Accuracy Random Forest:", accuracyrf)
print("Confusion matrix Random Forest:\n", cmrf)
print("\nAccuracy SVC:", accuracySVC)
print("Confusion matrix SVC:\n", cmSVC)
print("\nAccuracy Calib. Classifier:", accuracyCC)
print("Confusion matrix Calib. Classifier:\n", cmCC)
print("\nAccuracy Stochastic Gradient Descent:", accuracySGD)
print("Confusion matrix Stochastic Gradient Descent:\n", cmSGD)
# print("\nAccuracy Pipeline:", accuracyPip)
# print("Confusion matrix Pipeline:\n", cmPip)
# print("\nAccuracy Ensemble of Ensembles:", accuracyEoE)
# print("Confusion matrix Ensemble of Ensembles:\n", cmEoE)

Accuracy Random Forest: 0.7696655675930287
Confusion matrix Random Forest:
 [[2024  297]
 [ 681 1244]]

Accuracy SVC: 0.7618935468676401
Confusion matrix SVC:
 [[1835  486]
 [ 525 1400]]

Accuracy Calib. Classifier: 0.7616580310880829
Confusion matrix Calib. Classifier:
 [[1828  493]
 [ 519 1406]]

Accuracy Stochastic Gradient Descent: 0.7633066415449835
Confusion matrix Stochastic Gradient Descent:
 [[1908  413]
 [ 592 1333]]


In [None]:
from numpy import mean
from numpy import std

X_samples= ['''Liberals know what's up''',
            '''Abortion should be illegal.''',
            '''Trump for the win''',
            '''no more student debt yessss. Thanks biden''',
            '''even if Trump is old, biden is too old''',
            '''thank god roe v wade is gone''',
            '''MAGA. MAGA. MAGA''',
            '''biden hasn't been doing too bad the past couple of years''']

y_cgpt = [1, 0, 0, 1, 0, 0, 0, 1] # 0 is conservative (similar to female); 1 is liberal (similar to male)

cv_cgpt = vectorizer.transform(X_samples)
pd_cgpt = pd.DataFrame(data = cv_cgpt.toarray())
X_cgpt = pd_cgpt.iloc[:,:].values

predrfGPT = modelrf.predict(X_cgpt)
predCCGPT = modelCC.predict(X_cgpt)
predSGDGPT = modelSGD.predict(X_cgpt)

# report performance
print("\nAccuracy Random Forest:", accuracy_score(y_cgpt, predrfGPT))
print("Confusion matrix Random Forest (picks liberal all the time):\n", confusion_matrix(y_cgpt, predrfGPT))
print("\nAccuracy Calib. Classifier:", accuracy_score(y_cgpt, predCCGPT))
print("Confusion matrix Calib. Classifier:\n", confusion_matrix(y_cgpt, predCCGPT))
print("\nAccuracy Stochastic Gradient Descent:", accuracy_score(y_cgpt, predSGDGPT))
print("Confusion matrix Stochastic Gradient Descent:\n", confusion_matrix(y_cgpt, predSGDGPT))




Accuracy Random Forest: 0.375
Confusion matrix Random Forest (picks liberal all the time):
 [[3 0]
 [5 0]]

Accuracy Calib. Classifier: 0.625
Confusion matrix Calib. Classifier:
 [[3 0]
 [3 2]]

Accuracy Stochastic Gradient Descent: 0.5
Confusion matrix Stochastic Gradient Descent:
 [[3 0]
 [4 1]]




Word Embeddings/Cosine Similarity Political

In [32]:
# load word vectors
wordVectors = KeyedVectors.load_word2vec_format('../wordEmbeddings/GoogleNews-vectors-negative300.bin', binary=True)
manVector = wordVectors['liberal']
womanVector = wordVectors['conservative']

# get average vector
def text_to_average_vector(text):
    if text.strip() == "":
        return None
    words = text.split()
    vectors = [wordVectors[word] for word in words if word in wordVectors.vocab]
    if len(vectors) == 0:
        return None
    return np.mean(vectors, axis=0)

# use cosine similarity to predict bais
def predict_bias(inputText):
    similaritiesToLiberal = []
    similaritiesToConservative = []
    words = inputText.split()
    
    # go through each word and calculate cosine similarity
    for word in words:
        if word in wordVectors.key_to_index:
            similarityToLiberal = cosine_similarity(wordVectors[word].reshape(1, -1), manVector.reshape(1, -1))
            similarityToConservative = cosine_similarity(wordVectors[word].reshape(1, -1), womanVector.reshape(1, -1))
            similaritiesToLiberal.append(similarityToLiberal)
            similaritiesToConservative.append(similarityToConservative)
    
    # calculate average
    if len(similaritiesToLiberal) == 0 or len(similaritiesToConservative) == 0:
        return None, None
    avgSimilarityToLiberal = np.mean(similaritiesToLiberal)
    avgSimilarityToConservative = np.mean(similaritiesToConservative)
    
    # normalize
    probLiberalBias = (avgSimilarityToLiberal / (avgSimilarityToLiberal + avgSimilarityToConservative)) * 100
    probConservativeBias = (avgSimilarityToConservative / (avgSimilarityToLiberal + avgSimilarityToConservative)) * 100
    
    return probLiberalBias, probConservativeBias

# testing
inputText = """
biden hasn't been doing too bad the past couple of years
"""

addSpaceBeforeText = add_space_before(inputText)
inputText = removeUnnecessaryWords(addSpaceBeforeText)

probMaleBias, probFemaleBias = predict_bias(inputText)

print("Liberal Percentage:", probMaleBias)
print("Conservative Percentage:", probFemaleBias)

if probMaleBias > probFemaleBias:
    bias = "liberal" 
else:
    bias = "conservative"
print("Predicted bias:", bias)

# bias scores for each word
biasScores = {}
words = inputText.split()
for word in words:
    if word in wordVectors.key_to_index:
        similarityToLiberal = cosine_similarity(wordVectors[word].reshape(1, -1), manVector.reshape(1, -1))
        similarityToConservative = cosine_similarity(wordVectors[word].reshape(1, -1), womanVector.reshape(1, -1))
        biasScores[word] = (similarityToLiberal - similarityToConservative)[0][0]
sortedBiasScores = sorted(biasScores.items(), key=lambda item: item[1], reverse=True)

liberalWords = [(word, score) for word, score in sortedBiasScores if score > 0][:10]
conservativeWords = [(word, score) for word, score in sortedBiasScores if score < 0][:10]

print("Liberal words:", liberalWords)
print("Conservative words:", conservativeWords)

Liberal Percentage: 49.984556436538696
Conservative Percentage: 50.015443563461304
Predicted bias: conservative
Liberal words: [('biden', 0.09245667), ('coupl', 0.068678774), ('bad', 0.016252339), ('t', 0.015689336)]
Conservative words: [('hasn', -0.016007911), ('year', -0.08667511), ('past', -0.09083285)]
