In [None]:
# Install the tweet-preprocessor library (used for cleaning and preprocessing tweets)
!pip install tweet-preprocessor -q

# Install the latest version of gensim (a library for topic modeling and document similarity analysis)
!pip install -q -U gensim

# Install pyLDAvis (a Python library for interactive topic model visualization)
!pip install -q pyLDAvis

# Force reinstall numpy to version 1.22.4. This is currently necessary to ensure compatibility or fix certain issues.
!pip install --force-reinstall -q numpy==1.22.4

# Install LIME (Local Interpretable Model-Agnostic Explanations), a library for explaining machine learning model predictions
!pip install -q lime

# Install or update the imbalanced-learn library, useful for dealing with imbalanced datasets
!pip install -U imbalanced-learn -q # needs specific numpy. Can use to artificially make more customers from a low count group, and hence can be trained more accurately on the stuff we need

!pip install gradio -U -q

In [None]:
import pandas as pd
import numpy as np
import preprocessor as prepro # twitter prepro
from tqdm.notebook import tqdm #progress bar # since or things are now gonna take longer, or if we're stuck in somethign or if it's still working

import spacy #spacy for quick language prepro # Very good if you need to use language processing fast, not so much for neutral networks, hence dying a bit also probably.
nlp = spacy.load('en_core_web_sm') #instantiating English module

# sampling, splitting
from imblearn.under_sampling import RandomUnderSampler # need to bring it to the same distribution
from sklearn.model_selection import train_test_split


# loading ML libraries
from sklearn.pipeline import make_pipeline #pipeline creation
from sklearn.feature_extraction.text import TfidfVectorizer #transforms text to sparse matrix
from sklearn.linear_model import LogisticRegression #Logit model
from sklearn.metrics import classification_report #that's self explanatory
from sklearn.decomposition import TruncatedSVD #dimensionality reduction # dimensionality reduction well suited for sparce matrix
from xgboost import XGBClassifier

import altair as alt #viz

#explainability
from lime.lime_text import LimeTextExplainer
from collections import OrderedDict

# topic modeling

from gensim.corpora.dictionary import Dictionary # Import the dictionary builder
from gensim.models import LdaMulticore # we'll use the faster multicore version of LDA

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()

import gradio as gr

In [None]:
# Suppress DeprecationWarning messages so they do not clutter the cell output.
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
# Configure tweet-preprocessor: the four options selected here are URLs, numbers,
# reserved words and @mentions; these are what prepro.clean() is asked to handle.
# Trade-off noted by the author: removing these may discard some information.
prepro.set_options(prepro.OPT.URL, prepro.OPT.NUMBER, prepro.OPT.RESERVED, prepro.OPT.MENTION)

In [None]:
# Load both datasets (gzip-compressed JSON) directly from the course repository.
# data_pol: labelled tweets; its 'text' and 'labels' columns are used below to train the classifier.
data_pol = pd.read_json('https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz')
# data_tw: its 'tweet' column is classified later with the trained model
# (presumably tweets about the 2020 presidential debate, going by the file name).
data_tw = pd.read_json('https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pres_debate_2020.gz')
# Author's note: the goal here is predictive performance, not causal interpretation
# (the latter is where statistics would worry about e.g. collinearity).

In [None]:
data_pol.head()

In [None]:
# Lowercase the tweet text in place; this overwrites the original 'text' column.
data_pol['text'] = data_pol.text.str.lower()

In [None]:
data_tw.tweet.head()

In [None]:
# Keep only the tweet text as a Series.
# The isinstance guard makes this cell safe to re-run: once data_tw has been
# replaced by the Series, selecting 'tweet' from it again would fail.
if isinstance(data_tw, pd.DataFrame):
    data_tw = data_tw['tweet']

In [None]:
data_tw

In [None]:
# Basic cleanup of the (already lowercased) tweets:
#   1. prepro.clean handles the elements selected via prepro.set_options,
#   2. '#' characters are dropped so hashtags remain as plain words,
#   3. the retweet marker 'rt' is removed.
# The marker is matched as a whole word only: a plain substring replace would
# also mangle ordinary words such as "party" or "support".
data_pol['text_clean'] = (
    data_pol['text']
    .map(prepro.clean)
    .str.replace('#', '', regex=False)
    .str.replace(r'\brt\b', '', regex=True)
)

In [None]:
data_pol.head()

In [None]:
# Clean the tweets with spaCy while skipping the heavy pipeline components
# (tagger, parser, ner). tqdm wraps the document stream so the progress bar is
# closed automatically when the loop finishes.
# NOTE(review): depending on the installed spaCy version the lemmatizer may rely
# on the tagger; with it disabled lemma_ may be less accurate — confirm.

clean_text = []

docs = nlp.pipe(data_pol['text_clean'], disable=["tagger", "parser", "ner"])

for doc in tqdm(docs, total=len(data_pol['text_clean']), position=0, leave=True):

  # keep the lowercased lemma of every non-empty token that is
  # neither a stop word nor punctuation
  kept = [token.lemma_.lower() for token in doc
          if token
          and not token.is_stop
          and not token.is_punct]

  clean_text.append(" ".join(kept))

In [None]:
def text_prepro(texts: pd.Series) -> list:
    """
    Preprocess a series of texts, showing a progress bar.

    Parameters:
    - texts: A pandas Series containing the text to be preprocessed.

    Returns:
    - A list of preprocessed texts: one space-joined string per input row,
      in the same order as the input.

    Steps:
    - Clean twitter-specific elements with `prepro.clean` (as configured through
      `prepro.set_options`).
    - Remove '#' characters and lowercase the text.
    - Run the module-level spaCy pipeline `nlp` and keep the lowercased lemma of
      every token that is neither a stopword nor punctuation.

    Notes:
    - Relies on the module-level names `prepro`, `nlp` and `tqdm`.
    """
    # Clean first and lowercase afterwards, so that prepro.clean sees the original
    # casing. Previously the lowercased result was computed and then discarded,
    # so mixed-case input reached spaCy un-normalised.
    texts_cleaned = (
        texts.map(prepro.clean)
        .str.replace('#', '', regex=False)
        .str.lower()
    )

    # Container for the cleaned texts
    clean_container = []

    # nlp.pipe streams the documents; the heavy components are skipped.
    # NOTE(review): depending on the installed spaCy version the lemmatizer may
    # rely on the tagger — confirm lemma quality with it disabled.
    docs = nlp.pipe(texts_cleaned, disable=["tagger", "parser", "ner"])

    # Wrapping the stream in tqdm closes the progress bar when iteration ends.
    for doc in tqdm(docs, total=len(texts_cleaned), position=0, leave=True):

        # Lowercased lemmas of tokens that are not stopwords or punctuation
        tokens = [token.lemma_.lower() for token in doc
                  if token and not token.is_stop and not token.is_punct]

        clean_container.append(" ".join(tokens))

    return clean_container


In [None]:
# Run the full preprocessing function on the 'text' column; this overwrites the
# 'text_clean' column produced by the earlier basic-cleanup cell.
data_pol['text_clean'] = text_prepro(data_pol["text"])

In [None]:
# Modelling frame with two columns: the class label and the preprocessed text.
data_df = pd.DataFrame({'label':data_pol['labels'], "text":data_pol["text_clean"]})

In [None]:
data_df.head()

In [None]:
data_df.label.value_counts().reset_index()

In [None]:
# Count tweets per label.
# rename_axis + reset_index(name=...) names both columns explicitly, so the result
# does not depend on which default column names the installed pandas version
# gives to value_counts().reset_index().
data_chart = (
    data_df['label']
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='N Tweets')
)

# Replace numerical categories with textual descriptions (0 and 1 are the label ids)
data_chart['Category'] = data_chart['Category'].map({0: 'repuplican', 1: 'democrat'})

# Horizontal bar chart of the number of tweets per category
chart = alt.Chart(data_chart).mark_bar(filled=True).encode(
    alt.X('N Tweets:Q', title='N Tweets'),
    alt.Y('Category:O', title='Category', sort='-x'),
    color=alt.Color('Category:N', legend=alt.Legend(title="Label Types"), scale=alt.Scale(
        domain=['repuplican', 'democrat'],
        range=['red', 'green']
    ))
)

# Use the chart to judge class balance; the imbalance is corrected by
# undersampling in the next step.
chart

In [None]:
# Fix the sample imbalance by random undersampling (seeded for reproducibility).
rus = RandomUnderSampler(random_state=42)
# The whole frame is passed as X, so data_df_res keeps both the 'label' and 'text' columns.
# Author's note: for numerical features, generating synthetic samples for the
# under-represented class would likely work better than discarding data.
data_df_res, y_res = rus.fit_resample(data_df, data_df['label'])

In [None]:
data_df_res['label'].value_counts() # still enough to train a model # losing a bunch of data, but proabably getting better results in the end # 20 and 80% are on the edge, but usually it's fine even if it's 40-60

In [None]:
# Split the resampled data into a training set (60%) and a test set (40%), seeded.
# Author's note: a separate evaluation set of around 10% could be held out as well,
# keeping the test set for the very end; the library documentation is the best
# reference for this and for handling imbalanced data.
X_train, X_test, y_train, y_test = train_test_split(data_df_res['text'], y_res, test_size = 0.4, random_state = 42)


In [None]:
# Instantiate the models and bundle them up as a pipeline:
#   TF-IDF (sparse text features) -> TruncatedSVD (100 dense components) -> XGBoost.
# Both stochastic steps are seeded so Restart & Run All reproduces the same model.

tfidf = TfidfVectorizer()
svd = TruncatedSVD(n_components = 100, random_state = 42) # squeeze the sparse features into 100 components
cls_xg = XGBClassifier(random_state = 42)

pipe_xg = make_pipeline(tfidf, svd, cls_xg)

In [None]:
pipe_xg.fit(X_train,y_train) # fit model

In [None]:
# pipe_xg.fit(X_train,y_train) # fit model

In [None]:
# Evaluate model performance on the training set. These are in-sample predictions
# on the data the pipeline was fitted on; compare with the test-set report below.

y_eval = pipe_xg.predict(X_train)
report = classification_report(y_train, y_eval)
print(report)

In [None]:
# Evaluate model performance on the held-out test set.
# Note: y_eval and report are reused here and overwrite the training-set values.

y_eval = pipe_xg.predict(X_test)
report = classification_report(y_test, y_eval)
print(report)

In [None]:
# prepro.clean2

In [None]:
# Second name bound to the same data_tw object; no copy is made here.
cleaned_twitter_data = data_tw

In [None]:
# Predict a label for every tweet in data_tw:
# preprocess with the same function used for the training text, then classify
# with the fitted XGBoost pipeline. new_tw holds one predicted label per tweet,
# in the same order as data_tw.
programmed_tw = text_prepro(data_tw)
new_tw = pipe_xg.predict(programmed_tw)

In [None]:
data_tw

In [None]:
# Let's explain the result with LIME.
# The order of class_names follows the label ids used elsewhere in this notebook:
# position 0 -> "repuplican", position 1 -> "democrat".

class_names = ["repuplican", "democrat"]

explainer = LimeTextExplainer(class_names = class_names)

In [None]:
# Explain the prediction for one *preprocessed* tweet (element 3 of programmed_tw).
# num_features=10 is the number of words LIME uses in the explanation.
# NOTE(review): top_labels=3 although only two class names are defined — confirm
# that LIME simply returns both labels in that case.
exp = explainer.explain_instance(programmed_tw[3], pipe_xg.predict_proba, num_features = 10, top_labels=3)

In [None]:
exp.show_in_notebook(text=True)

In [None]:
# Same explanation, but on the *raw* tweet text: the preprocessing step is skipped
# here, so the pipeline sees text it was not trained on in this form.
# num_features=10 is the number of words LIME uses in the explanation.
exp = explainer.explain_instance(data_tw[3], pipe_xg.predict_proba, num_features = 10, top_labels=3)

In [None]:
exp.show_in_notebook(text=True)

In [None]:
# Wrap the tweet Series in a DataFrame so a label column can be added next to it.
cleaned_twitter_data2 = pd.DataFrame(cleaned_twitter_data)

In [None]:

# Attach the predicted labels; new_tw is in the same order as the tweets it was predicted from.
cleaned_twitter_data2 ["label"] = new_tw

In [None]:
cleaned_twitter_data2.head()

In [None]:
# Split the tweets by predicted label (1 = democrat, 0 = republican).
# .copy() gives each subset its own data: a 'tokens' column is added to both
# frames later, which on a plain filtered slice raises SettingWithCopyWarning.
tw_demo = cleaned_twitter_data2[cleaned_twitter_data2['label']== 1].copy()
tw_repu = cleaned_twitter_data2[cleaned_twitter_data2['label']== 0].copy()

In [None]:
print(tw_demo.head())
print(tw_repu.head())

In [None]:
# Topic modelling needs token lists rather than joined strings.
# For each tweet predicted as republican keep the lowercased lemma of every
# noun, proper noun, adjective or adverb that is not a stop word or punctuation.
# Only the "ner" component is disabled here, so POS tags are available.
tokensrepu = [
    [
        tok.lemma_.lower()
        for tok in doc
        if not (tok.is_stop or tok.is_punct)
        and tok.pos_ in ('NOUN', 'PROPN', 'ADJ', 'ADV')
    ]
    for doc in nlp.pipe(tw_repu['tweet'], disable=["ner"])
]

In [None]:
# Topic modelling needs token lists rather than joined strings.
# For each tweet predicted as democrat keep the lowercased lemma of every
# noun, proper noun, adjective or adverb that is not a stop word or punctuation.
# Only the "ner" component is disabled here, so POS tags are available.
tokensdemo = [
    [
        tok.lemma_.lower()
        for tok in doc
        if not (tok.is_stop or tok.is_punct)
        and tok.pos_ in ('NOUN', 'PROPN', 'ADJ', 'ADV')
    ]
    for doc in nlp.pipe(tw_demo['tweet'], disable=["ner"])
]

In [None]:
# Add the token lists as a 'tokens' column.
# .assign returns a new frame instead of writing into a filtered slice of
# cleaned_twitter_data2, which avoids pandas' SettingWithCopyWarning.
tw_repu = tw_repu.assign(tokens=tokensrepu)
tw_demo = tw_demo.assign(tokens=tokensdemo)

In [None]:
# Build a gensim Dictionary from the republican-labelled token lists
dictionaryr = Dictionary(tw_repu['tokens'])
# Filter the vocabulary: drop tokens that appear in fewer than 5 documents or in
# more than 30% of the documents, and keep at most 1000 tokens
dictionaryr.filter_extremes(no_below=5, no_above=0.3, keep_n=1000)
# Construct the bag-of-words corpus with this dictionary
# (the undefined name `dictionary` was used here before, which raises NameError)
corpusr = [dictionaryr.doc2bow(doc) for doc in tw_repu['tokens']]

In [None]:
# Train the LDA topic model (10 topics) on the republican-labelled corpus.
# random_state seeds the model for reproducibility.
# NOTE(review): with several workers the result may still vary slightly between runs — confirm.
lda_modelr = LdaMulticore(corpusr, id2word=dictionaryr, num_topics=10, workers = 4, passes=10, random_state=42)

In [None]:
lda_modelr.print_topics()

In [None]:
# Build a gensim Dictionary from the democrat-labelled token lists
dictionaryd = Dictionary(tw_demo['tokens'])
# Filter the vocabulary: drop tokens that appear in fewer than 5 documents or in
# more than 30% of the documents, and keep at most 1000 tokens
dictionaryd.filter_extremes(no_below=5, no_above=0.3, keep_n=1000)
# Construct the bag-of-words corpus with this dictionary
corpusd = [dictionaryd.doc2bow(doc) for doc in tw_demo['tokens']]

In [None]:
# Train the LDA topic model (10 topics) on the democrat-labelled corpus.
# random_state seeds the model for reproducibility.
# NOTE(review): with several workers the result may still vary slightly between runs — confirm.
lda_modeld = LdaMulticore(corpusd, id2word=dictionaryd, num_topics=10, workers = 4, passes=10, random_state=42)

In [None]:
lda_modeld.print_topics()

In [None]:
# Prepare the interactive visualisation for the republican topic model.
# The names lda_model / corpus / dictionary used here before are never defined in
# this notebook; the per-party objects are lda_modelr/corpusr/dictionaryr and
# lda_modeld/corpusd/dictionaryd. Swap in the *d objects to inspect the democrat model.
lda_display = gensimvis.prepare(lda_modelr, corpusr, dictionaryr)

In [None]:
 # Let's Visualize
pyLDAvis.display(lda_display) # left is basically a PCA map size represents how many documents are in tha topic

In [None]:
def text_prepro2(texts: pd.Series) -> list:
    """
    Preprocess a series of texts without showing a progress bar.

    Parameters:
    - texts: A pandas Series containing the text to be preprocessed.

    Returns:
    - A list of preprocessed texts: one space-joined string per input row,
      in the same order as the input.

    Steps:
    - Clean twitter-specific elements with `prepro.clean` (as configured through
      `prepro.set_options`).
    - Remove '#' characters and lowercase the text.
    - Run the module-level spaCy pipeline `nlp` and keep the lowercased lemma of
      every token that is neither a stopword nor punctuation.

    Notes:
    - Relies on the module-level names `prepro` and `nlp`.
    """
    # Clean first and lowercase afterwards, so that prepro.clean sees the original
    # casing. Previously the lowercased result was computed and then discarded,
    # so mixed-case input reached spaCy un-normalised.
    texts_cleaned = (
        texts.map(prepro.clean)
        .str.replace('#', '', regex=False)
        .str.lower()
    )

    # nlp.pipe streams the documents; the heavy components are skipped.
    # NOTE(review): depending on the installed spaCy version the lemmatizer may
    # rely on the tagger — confirm lemma quality with it disabled.
    clean_container = []
    for doc in nlp.pipe(texts_cleaned, disable=["tagger", "parser", "ner"]):

        # Lowercased lemmas of tokens that are not stopwords or punctuation
        tokens = [token.lemma_.lower() for token in doc
                  if token and not token.is_stop and not token.is_punct]

        clean_container.append(" ".join(tokens))

    return clean_container


In [None]:
def predictpolitical(placetext):
    """
    Classify a single piece of text with the fitted pipeline.

    Parameters:
    - placetext: The text to classify. None is treated as an empty string so an
      empty input does not break the string preprocessing.

    Returns:
    - A string naming the predicted class ("Democrat" for label 1, "Republican"
      for label 0) followed by the predicted probability of that class, rounded
      to two decimals. Returns None if the model predicts any other label.

    Notes:
    - Relies on the module-level names `text_prepro2` and `pipe_xg`.
    """
    text = "" if placetext is None else str(placetext)

    # The preprocessing function expects a Series, so wrap the single text
    text_rdy = text_prepro2(pd.Series([text]))

    # Take the scalar label / probability row of the single input explicitly
    # instead of comparing a whole array against a number
    label = int(pipe_xg.predict(text_rdy)[0])
    proba = pipe_xg.predict_proba(text_rdy)[0]

    if label == 1:
        return "Democrat, probability = " + str(proba[1].round(2))
    if label == 0:
        return "Republican probability = " + str(proba[0].round(2))


predictpolitical("Hello you little person")

In [None]:
# Create a Gradio interface with one text box as input and plain text as output
interface = gr.Interface(                  # build the Gradio interface
    fn=predictpolitical,                       # function called with the text box content
    inputs=[gr.Textbox(label = "Insert text to find any political lean")
    ],
    outputs="text",
    title="Find political lean",
)

# Start the app
interface.launch()
