# Metadata

```yaml
Course:   DS5001: Exploratory Text Analytics
Topic:    Final Project, Models
Author:   Andrew Avitabile
Date:     24 March 2024 (Edited April 25, 2024)
```

# Set Up

## Packages

In [4]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

#BERT transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import torch

In [None]:
# Define the base path.
# The project root can be overridden with the DS5001_BASE_PATH environment
# variable so the notebook is not tied to one machine; the original location
# remains the default. A single trailing "/" is enforced because later cells
# build file names by plain string concatenation (base_path + "output/...").
base_path = os.environ.get(
    "DS5001_BASE_PATH", "C:/Users/Andre/Box/DS5001 Final Project/"
).rstrip("/\\") + "/"

## Import data

In [None]:
# Load the pipe-delimited tables produced by the earlier pipeline stage.
LIB = pd.read_csv(base_path + "output/LIB.csv", delimiter = "|")
CORPUS = pd.read_csv(base_path + "output/CORPUS.csv", delimiter = "|")

# NOTE(review): this line previously read "output/TFIDF_L2.csv", which looks
# like a copy-paste slip: later cells read VOCAB['max_pos'] and VOCAB['dfidf']
# and use the VOCAB index as the term string. "VOCAB.csv" is assumed by analogy
# with LIB.csv / CORPUS.csv, and `term_str` is assumed to be the index column
# because the t-SNE plot expects it after reset_index() -- confirm both.
VOCAB = pd.read_csv(base_path + "output/VOCAB.csv", delimiter = "|", index_col="term_str")

# `index=True` is a DataFrame.to_csv option, not a read_csv one; passing it to
# read_csv raises TypeError, so it is dropped here.
BOW_sentence = pd.read_csv(base_path + "output/BOW_sentence.csv", sep='|')

# NOTE(review): TFIDF and TFIDF_L2 are loaded from the same file -- confirm
# whether a separate un-normalized TFIDF file was intended.
TFIDF = pd.read_csv(base_path + "output/TFIDF_L2.csv", delimiter = "|")
TFIDF_L2 = pd.read_csv(base_path + "output/TFIDF_L2.csv", delimiter = "|")

# Models

## Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

# Number of PCA components
n_components = 5

# Component labels are derived from n_components so that changing the number
# of components above does not break the DataFrame constructors below.
pc_labels = [f'PC{i+1}' for i in range(n_components)]

# Initialize PCA and project the rows of TFIDF_L2 onto the components
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(TFIDF_L2)

# Create a DataFrame with the principal components (one row per input row)
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)

# Getting the PCA components (i.e., eigenvectors)
components = pca.components_

# Relate components to original features. The labels come from TFIDF_L2, the
# matrix the model was actually fitted on, so loadings and terms cannot drift
# apart if TFIDF and TFIDF_L2 ever have different columns.
terms = TFIDF_L2.columns
PCA_COMPONENTS = pd.DataFrame(components, columns=terms, index=pc_labels)

# Transpose so that rows are terms and columns are components
PCA_COMPONENTS = PCA_COMPONENTS.T
PCA_COMPONENTS

In [None]:
# Initialize PCA.
# The estimator is bound to `pca` (not `PCA`) so the imported class name is not
# shadowed by an instance; shadowing it would make any later `PCA(...)` call
# fail, including a re-run of this cell.
pca = PCA(n_components=n_components)

# Fit PCA on the normalized TFIDF data and transform the data
document_components = pca.fit_transform(TFIDF_L2.values)

# Create a DataFrame for the document-component matrix
PCA_DCM = pd.DataFrame(
    document_components,
    index=TFIDF_L2.index,
    columns=[f'PC{i+1}' for i in range(n_components)],
)

# Merge with LIB on the row index of both tables
PCA_DCM = LIB.merge(PCA_DCM, left_index=True, right_index=True)
PCA_DCM

In [None]:
import plotly.graph_objects as go

# One marker per document, positioned by its PC1 / PC2 scores; the
# `overallcomments` column is attached as the per-point text.
doc_trace = go.Scatter(
    x=PCA_DCM['PC1'],
    y=PCA_DCM['PC2'],
    mode='markers',
    text=PCA_DCM['overallcomments'],
    name='Documents',
)

fig = go.Figure(data=[doc_trace])

# Titles and axis labels; update_layout returns the figure, so the cell
# renders it as its output.
fig.update_layout(
    title='PCA Analysis: Documents and Loadings',
    xaxis_title='Component 1',
    yaxis_title='Component 2',
)

In [None]:
# Same document scatter as the previous figure, but on components 2 and 3.
doc_trace = go.Scatter(
    x=PCA_DCM['PC2'],
    y=PCA_DCM['PC3'],
    mode='markers',
    text=PCA_DCM['overallcomments'],
    name='Documents',
)

fig = go.Figure(data=[doc_trace])

# Titles and axis labels; update_layout returns the figure, so the cell
# renders it as its output.
fig.update_layout(
    title='PCA Analysis: Documents and Loadings',
    xaxis_title='Component 2',
    yaxis_title='Component 3',
)

In [None]:
# Term loadings on PC1 vs PC2 (one point per row of PCA_COMPONENTS),
# drawn through an explicit Axes object instead of the pyplot state machine.
loadings_ax = plt.figure(figsize=(10, 8)).add_subplot()
loadings_ax.scatter(PCA_COMPONENTS['PC1'], PCA_COMPONENTS['PC2'], color='blue', alpha=0.5)
loadings_ax.set_title('PCA - Loadings Plot on First Two Principal Components')
loadings_ax.set_xlabel('Principal Component 1 Loadings')
loadings_ax.set_ylabel('Principal Component 2 Loadings')
loadings_ax.grid(True)
plt.show()

In [None]:
# Term loadings on PC2 vs PC3 (one point per row of PCA_COMPONENTS).
plt.figure(figsize=(10, 8))
plt.scatter(PCA_COMPONENTS['PC2'], PCA_COMPONENTS['PC3'], color='blue', alpha=0.5)
# The title now names the components actually plotted; it previously repeated
# the "First Two Principal Components" title of the PC1/PC2 figure.
plt.title('PCA - Loadings Plot on Second and Third Principal Components')
plt.xlabel('Principal Component 2 Loadings')
plt.ylabel('Principal Component 3 Loadings')
plt.grid(True)
plt.show()

## Topic Models

In [None]:
# POS tags to exclude from the topic-model input: the listed noun/pronoun
# tags, punctuation and symbol tags, and cardinal numbers ('CD').
unwanted_tags = [
    'NNP', 'PRP', 'PRP$', 'WP', 'WP$',
    '.', ',', ':', '``', "''", '(', ')', '#',
    'CD', '$',
]

# Keep only the tokens whose `pos` value is not in the exclusion list.
is_unwanted = CORPUS['pos'].isin(unwanted_tags)
CORPUS_LIM = CORPUS.loc[~is_unwanted]

In [None]:
# CountVectorizer treats every element of its input as one document. Passing
# the token-level `term_str` column directly would therefore make each token
# its own one-word "document", so the tokens are first joined back into one
# string per document_id. Missing terms are dropped and terms are cast to str
# so that the join cannot fail on NaN or non-string values.
doc_strings = (
    CORPUS_LIM
    .dropna(subset=['term_str'])
    .groupby('document_id')['term_str']
    .agg(lambda toks: ' '.join(map(str, toks)))
)

# Document-term count matrix: one row per document_id, in doc_strings order.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(doc_strings)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Topic-model settings: 10 topics, fixed seed for repeatable fits.
lda_params = dict(n_components=10, random_state=42)

lda = LatentDirichletAllocation(**lda_params)
# fit() returns the estimator, which the cell shows as its output.
lda.fit(dtm)

In [None]:
# Topic-Term Distribution (PHI): one row per topic, one column per vocabulary term.
# NOTE(review): lda.components_ is used as-is; if PHI is meant to be a
# probability distribution per topic, confirm whether rows need normalizing.
vocab_terms = vectorizer.get_feature_names_out()
phi_df = pd.DataFrame(data=lda.components_, columns=vocab_terms)

# Document-Topic Distribution (THETA): one row per row of dtm.
doc_topic = lda.transform(dtm)
theta_df = pd.DataFrame(data=doc_topic)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_`` (one weight array per topic)
    feature_names : sequence mapping a column position to its term string
    no_top_words : int, how many terms to print per topic

    For each topic a header line ``Topic <idx>:`` is printed, followed by a
    line with the top terms separated by spaces, highest weight first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_ids = weights.argsort()[::-1][:no_top_words]
        top_terms = " ".join(feature_names[i] for i in top_ids)
        print(f"Topic {topic_idx}:")
        print(top_terms)

# Show the ten highest-weighted terms for each fitted LDA topic.
display_topics(
    model=lda,
    feature_names=vectorizer.get_feature_names_out(),
    no_top_words=10,
)

In [None]:
# DISABLED: transformer-based sentence sentiment.
# This code used to be wrapped in a triple-quoted string that was never closed,
# which makes the cell a SyntaxError; it is kept as ordinary comments instead.
# NOTE(review): `SENTENCES` is not defined in any visible cell -- it must be
# built before this block can be re-enabled. Re-enabling it would also define a
# `get_sentiment` that a later cell redefines.
#
# # Initialize tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
# model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
#
# def get_sentiment(text):
#     # Encode the text using tokenizer
#     encoded_input = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
#     # Get model output
#     with torch.no_grad():
#         output = model(**encoded_input)
#     # Calculate probabilities using softmax
#     probabilities = softmax(output.logits, dim=1)
#     # Extract scores for each sentiment class
#     sentiment_scores = probabilities.numpy().flatten()
#     labels = ['negative', 'neutral', 'positive']
#     # Create a dictionary of labels and their corresponding scores
#     sentiment_result = dict(zip(labels, sentiment_scores))
#     return sentiment_result
#
# # Step 2: Apply sentiment analysis
# SENTENCES['sentiment'] = SENTENCES['sentence'].apply(get_sentiment)
#
# # Step 3: Optionally merge this back with the original dataframe if needed
# CORPUS_W_SENT = pd.merge(CORPUS.drop(columns='term_str'), SENTENCES, on=['document_id', 'sentence_num'], how='left')

## Sentiment Analysis

### VOCAB

In [None]:
# Sentiment analyzer instance shared by get_sentiment below.
# NOTE(review): SentimentIntensityAnalyzer is not imported in any visible
# cell -- confirm which package provides it and add the import.
sia = SentimentIntensityAnalyzer()


def get_sentiment(term):
    """Return the compound sentiment score for ``term``.

    The analyzer's ``polarity_scores`` result is a mapping; only its
    ``'compound'`` entry is returned, for simplicity.
    """
    return sia.polarity_scores(term)['compound']

In [None]:
# Score every VOCAB index label and store the result in a `sentiment` column.
VOCAB['sentiment'] = [get_sentiment(term) for term in VOCAB.index]

# Show only the rows whose sentiment score is not zero.
VOCAB[VOCAB['sentiment'] != 0]

### BOW

## Word Embeddings (word2vec)

In [None]:
from gensim.models import word2vec
from gensim.corpora import Dictionary
from sklearn.manifold import TSNE as tsne
import plotly_express as px

In [None]:
# Index the token table by (document, sentence, token position).
# The guard makes the cell safe to re-run: once the three columns have been
# moved into the index, calling set_index again would raise KeyError.
corpus_index_cols = ['document_id', 'sentence_num', 'token_num']
if list(CORPUS.index.names) != corpus_index_cols:
    CORPUS = CORPUS.set_index(corpus_index_cols)

In [None]:
# Grouping key used when tokens are collected into gensim-style documents:
# every distinct value of this key becomes one bag of terms.
BAG = 'document_id'

In [None]:
# Gensim-style documents: one list of term strings per BAG value.
# Tokens whose POS tag matches 'NNPS?' and rows without a term_str are dropped
# before grouping.
docs = (
    CORPUS[~CORPUS.pos.str.match('NNPS?')]
    .dropna(subset=['term_str'])
    .groupby(BAG)
    .term_str.apply(lambda x: x.tolist())
    .reset_index()['term_str']
    .tolist()
)
docs = [doc for doc in docs if len(doc) > 1] # Lose single word docs

# Build the gensim Dictionary only after `docs` exists; it previously ran in a
# cell placed before `docs` was defined, which fails on a fresh kernel.
vocab = Dictionary(docs)

In [None]:
# Keyword arguments forwarded to the Word2Vec constructor.
w2v_params = {
    'window': 5,
    'vector_size': 246,
    'min_count': 50,  # THIS LIMITS OUR VOCAB
    'workers': 4,
}

In [None]:
# Train the word2vec model on the token lists in `docs` using w2v_params.
# NOTE(review): no seed is passed and workers=4, so repeated runs may give
# different vectors -- confirm whether reproducibility matters here.
model = word2vec.Word2Vec(sentences=docs, **w2v_params)

### Visualize with t-SNE

In [None]:
def get_vector(row):
    """Look up the word2vec vector for one table row.

    The row's ``name`` (its index label) is used as the key into ``model.wv``.
    Returns the stored vector, or ``None`` when the lookup raises ``KeyError``,
    so that callers can drop terms missing from the model.
    """
    try:
        return model.wv[row.name]
    except KeyError:
        return None

In [None]:
# Look up a vector for every VOCAB row and discard rows with no vector (None).
term_vectors = VOCAB.apply(get_vector, axis=1).dropna()

# Expand each stored vector into its own row of columns (one column per dimension).
WV = pd.DataFrame(term_vectors).apply(lambda row: pd.Series(row[0]), axis=1)

In [None]:
# Inspect the word-vector table.
WV

### Use scikit-learn's TSNE class

In [None]:
# t-SNE settings: 2-D output, PCA initialisation, fixed seed.
# NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
# `max_iter` -- confirm against the installed version.
tsne_params = dict(
    perplexity=40,
    n_components=2,
    init='pca',
    n_iter=2500,
    random_state=23,
)
tsne_engine = tsne(**tsne_params)

In [None]:
# Project the word vectors to the low-dimensional t-SNE embedding.
wv_matrix = WV.to_numpy()
tsne_model = tsne_engine.fit_transform(wv_matrix)

In [None]:
# Wrap the embedding in a DataFrame that keeps WV's index, with the two
# coordinates named x and y.
TSNE = pd.DataFrame(
    data=tsne_model,
    index=WV.index,
    columns=['x', 'y'],
)

In [None]:
# Inspect the t-SNE coordinate table.
TSNE

In [None]:
# Filter out tokens with undesired POS tags including all forms of punctuation and cardinal numbers
unwanted_tags = ['NNP', 'PRP', 'PRP$', 'WP', 'WP$', '.', ',', ':', '``', "''", '(', ')', '#', 'CD', '$']

# Attach VOCAB attributes to the t-SNE coordinates, matching rows on the index.
# An inner join is required for the POS filter to take effect: a left join
# keeps every TSNE row, so filtered-out terms would stay in X with NaN
# `max_pos` / `dfidf` values, which the scatter plot uses for color and size.
vocab_kept = VOCAB[~VOCAB['max_pos'].isin(unwanted_tags)]
X = TSNE.join(vocab_kept, how='inner')

In [None]:
# Scatter of the t-SNE coordinates: each point is labelled with its term,
# colored by `max_pos` and sized by `dfidf`.
tsne_fig = px.scatter(
    X.reset_index(),
    x='x',
    y='y',
    text='term_str',
    color='max_pos',
    hover_name='term_str',
    size='dfidf',
    height=1000,
)

# Draw the term label above each marker; update_traces returns the figure,
# so the cell renders it as its output.
tsne_fig.update_traces(
    mode='markers+text',
    textfont=dict(color='black', size=14, family='Arial'),
    textposition='top center',
)