In [5]:
import matplotlib.pyplot as plt

import seaborn as sns
import nltk
import pandas as pd


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag, bigrams
from nltk.sentiment import SentimentIntensityAnalyzer
import string
import numpy as np
import time
from textblob import TextBlob
import re
from bs4 import BeautifulSoup
from joblib import Parallel, delayed, dump, cpu_count
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from scipy.stats import reciprocal, uniform

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import chi2, SelectKBest


from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.svm import SVC


from collections import Counter

In [None]:
# Load the Reddit climate-change comments CSV into a DataFrame using pandas.
# NOTE(review): the path below is absolute and machine-specific, so this cell
# only runs where that exact file exists — consider a configurable data directory.
df = pd.read_csv('/Users/alfaizahmed/Documents/reddit climate change/the-reddit-climate-change-dataset-comments.csv')


# Serialise the freshly loaded frame to 'dataframe.joblib' in the working directory.
# NOTE(review): no visible cell loads this file back — confirm the dump is still needed.
dump(df, 'dataframe.joblib')


In [None]:
# Drop the metadata columns that the rest of the notebook does not use.
# `DataFrame.drop(..., inplace=True)` returns None, so the previous assignment
# always left `col_remove2` as None. Rebinding `df` keeps the result visible,
# and errors='ignore' lets the cell be re-run without raising KeyError.
df = df.drop(columns=['sentiment', 'type', 'permalink'], errors='ignore')
col_remove2 = df  # name kept so any later reference still resolves
col_remove2.head()

In [None]:
# Keep a 2% random sample of the rows, seeded with random_state=42.
# NOTE(review): `df` is rebound to its own sample, so re-running this cell
# takes 2% of the already-sampled frame.
df = df.sample(frac = 0.02, random_state = 42)

In [None]:
df

In [None]:
num_cores = cpu_count()
print(num_cores)


In [None]:



def some_function(i):
    """Return ``i`` multiplied by itself."""
    squared = i * i
    return squared

# Run some_function over 0..9 through joblib with `num_cores` workers.
# NOTE(review): `results` is reassigned by the sentiment-scoring cell further down.
results = Parallel(n_jobs=num_cores)(delayed(some_function)(i) for i in range(10))



In [None]:


# Initialize VADER sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()

def compute_sentiment(text):
    """Return VADER's compound polarity score for one piece of text.

    Args:
        text: Text to score. Non-string values (e.g. ``None`` or NaN) are
            treated as empty text.

    Returns:
        float: The ``'compound'`` entry of ``sia.polarity_scores(text)``, or
        ``0.0`` when ``text`` is not a string.
    """
    # Guard before calling VADER so a single missing body cannot abort the
    # whole parallel scoring run.
    if not isinstance(text, str):
        return 0.0
    return sia.polarity_scores(text)['compound']


# NOTE(review): this fixed worker count replaces the cpu_count() value that an
# earlier cell stored in `num_cores`.
num_cores = 6

# Compute the sentiment score of every comment body in parallel (one delayed task per row)
results = Parallel(n_jobs=num_cores)(
    delayed(compute_sentiment)(text) for text in df['body']
)

# Store the scores as a new column on the sampled frame.
df['compound_score'] = results


In [None]:

# Bucket the VADER compound score into three classes using the +/-0.05 cut-offs.
conditions = [
    df['compound_score'] > 0.05,
    (df['compound_score'] >= -0.05) & (df['compound_score'] <= 0.05),
    df['compound_score'] < -0.05
]
choices = ['positive', 'neutral', 'negative']

# np.select fills unmatched rows with `default`, which is the integer 0 unless
# overridden. That cannot share a dtype with the string choices (TypeError on
# recent NumPy, a stray '0' label on older releases), so give it a string.
# Rows with a missing score match no condition and are labelled 'neutral'.
df['sentiment_label'] = np.select(conditions, choices, default='neutral')

# PRE PROCESSING

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean one raw comment body and return it as a space-joined token string.

    Steps: strip HTML, remove URLs, lowercase, delete punctuation, tokenize,
    drop English stop words and lemmatize the remaining tokens.

    Args:
        text: Raw comment text. Non-string values (e.g. ``None`` or NaN) are
            treated as empty.

    Returns:
        str: The cleaned tokens joined by single spaces ("" for non-string input).
    """
    # A missing body would otherwise crash inside BeautifulSoup.
    if not isinstance(text, str):
        return ""

    # Remove HTML tags and URLs. The dot after "www" is escaped so that only a
    # literal "www." prefix is treated as a URL.
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Lowercase the text
    text = text.lower()

    # Remove punctuation. string.punctuation already contains the comma and
    # both quote characters, so one pass replaces the previous three.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize, then remove stopwords and lemmatize the words.
    # The POS tags and bigrams that used to be built here were never used in
    # the return value, so that per-comment work has been dropped.
    words = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    # Join the words back into a string
    return " ".join(words)

In [None]:

def process_chunk(chunk):
    """Return a copy of ``chunk`` whose ``body`` column has been preprocessed.

    Args:
        chunk: DataFrame slice containing a ``body`` column.

    Returns:
        DataFrame: A new frame with ``preprocess_text`` applied to every
        ``body`` value; all other columns and the index are carried over.
    """
    # assign() builds a new frame instead of writing into the caller's object,
    # which may be a slice of a larger DataFrame (SettingWithCopyWarning).
    return chunk.assign(body=chunk['body'].apply(preprocess_text))

In [None]:


def main(df, chunksize=10 ** 4, n_jobs=6):
    """Preprocess the ``body`` column of ``df`` in parallel, chunk by chunk.

    Args:
        df: DataFrame to process; each chunk is passed to ``process_chunk``.
        chunksize: Maximum number of rows handed to one worker task.
        n_jobs: Number of joblib workers.

    Returns:
        DataFrame: The processed chunks concatenated back together. An empty
        ``df`` yields an empty copy instead of raising.

    Raises:
        ValueError: If ``chunksize`` is smaller than 1.
    """
    if chunksize < 1:
        raise ValueError("chunksize must be at least 1")

    start_time = time.time()

    # Split the DataFrame into consecutive row slices. Positional slicing
    # avoids np.array_split, which raises when asked for zero sections (the
    # case for an empty frame).
    chunks = [df.iloc[start:start + chunksize] for start in range(0, len(df), chunksize)]

    if chunks:
        # Use joblib to parallelize the processing of chunks
        processed_chunks = Parallel(n_jobs=n_jobs)(
            delayed(process_chunk)(chunk) for chunk in chunks
        )
        # Concatenate the processed chunks
        processed_df = pd.concat(processed_chunks)
    else:
        processed_df = df.copy()

    end_time = time.time()

    print("Preprocess took", end_time - start_time, "seconds.")

    return processed_df



In [None]:
df_processed = main(df)

In [None]:
prepared_df = df_processed.copy()

In [None]:
visualisation = df_processed.copy()

In [None]:
prepared_df.head()

In [None]:
# Check for missing values
missing_values = prepared_df.isnull().sum()

# Check data types
data_types = prepared_df.dtypes

# Summary statistics for numerical columns
summary_statistics = prepared_df.describe()

missing_values, data_types, summary_statistics


In [None]:
#find out the rows with the missing values
missing_sentiment_rows = prepared_df[prepared_df['body'].isnull()]
missing_sentiment_rows


In [None]:
# Convert the 'created_utc' column of the visualisation copy to pandas
# datetimes, reading the stored values as seconds (unit='s').

visualisation['created_utc'] = pd.to_datetime(visualisation['created_utc'], unit='s')


In [None]:
prepared_df.info()

# Visualisation

In [None]:
temp = prepared_df.groupby('sentiment_label').count()['body'].reset_index().sort_values(by='body', ascending=False)
temp.style.background_gradient(cmap='Purples')


In [None]:
# Setting up the plotting style
sns.set_style("whitegrid")

# Use one explicit figure/axes pair. The previous 18x12 figure reserved a 2x2
# grid but this cell only ever drew its first panel, leaving most of it blank.
fig, ax = plt.subplots(figsize=(9, 6))

# Plotting distribution of sentiment
sns.histplot(visualisation['compound_score'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Sentiment')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Frequency')
plt.show()





In [None]:
# Plotting distribution of comments over time.
# This cell creates its own figure: the previous plt.subplot(2, 2, 3) call
# depended on a figure opened in another cell, which is no longer current when
# cells are rendered separately.
fig, ax = plt.subplots(figsize=(9, 6))
visualisation.resample('M', on='created_utc').size().plot(ax=ax)
ax.set_title('Number of Comments Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Comments')
plt.show()



In [None]:

# Calculate the number of comments for each subreddit
subreddit_comment_volume = visualisation['subreddit.name'].value_counts().head(10)

# Plotting the top 10 subreddits by comment volume
plt.figure(figsize=(12, 6))
subreddit_comment_volume.plot(kind='bar', color='lightcoral')
plt.title('Top 10 Subreddits by Comment Volume')
plt.xlabel('Subreddit')
plt.ylabel('Number of Comments')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Resampling data to get average sentiment per month
avg_sentiment_time = visualisation.resample('M', on='created_utc')['compound_score'].mean()

# Plotting the trend
plt.figure(figsize=(14, 6))
avg_sentiment_time.plot()
plt.title('Trend of Average Comment Sentiment Over Time')
plt.xlabel('Date')
plt.ylabel('Average Sentiment')
plt.grid(True)
plt.show()


In [None]:
# Calculate the average sentiment score for the top 10 subreddits
subreddit_sentiment = visualisation.groupby('subreddit.name').compound_score.mean()
top_subreddits_sentiment = subreddit_sentiment[subreddit_comment_volume.index]

# Plotting the average sentiment scores for the top 10 subreddits
plt.figure(figsize=(12, 6))
top_subreddits_sentiment.plot(kind='bar', color='lightseagreen')
plt.title('Average Sentiment Score for Top 10 Subreddits')
plt.xlabel('Subreddit')
plt.ylabel('Average Sentiment Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Calculate the average sentiment and score for NSFW and non-NSFW subreddits
nsfw_sentiment = visualisation.groupby('subreddit.nsfw').compound_score.mean()
nsfw_score = visualisation.groupby('subreddit.nsfw').score.mean()

# Plotting the average sentiment and score for NSFW vs. non-NSFW subreddits
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Plot for Sentiment
nsfw_sentiment.plot(kind='bar', ax=ax1, color=['lightblue', 'lightcoral'])
ax1.set_title('Average Sentiment Score by NSFW Category')
ax1.set_xlabel('NSFW')
ax1.set_ylabel('Average Sentiment Score')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=0)

# Plot for Score
nsfw_score.plot(kind='bar', ax=ax2, color=['lightblue', 'lightcoral'])
ax2.set_title('Average Comment Score by NSFW Category')
ax2.set_xlabel('NSFW')
ax2.set_ylabel('Average Comment Score')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=0)

plt.tight_layout()
plt.show()


# FEATURE EXTRACTION 

In [None]:


# Initialize a TF-IDF vectorizer (English stop words removed, at most 1000 features)
tfidf_vectorizer = TfidfVectorizer( stop_words='english',max_features= 1000)

# Fit the vectorizer on the preprocessed 'body' column and transform it into TF-IDF features.
# NOTE(review): the fit uses every row of prepared_df, and tfidf_features is only
# split into train/test afterwards, so test rows contribute to the fitted vocabulary.
tfidf_features = tfidf_vectorizer.fit_transform(prepared_df['body'])

# Show the dimensions of the resulting feature matrix
tfidf_features.shape


In [None]:

encoder = LabelEncoder()
encoded_sentiments = encoder.fit_transform(prepared_df['sentiment_label'])


In [None]:

# Calculate chi-square scores
chi2_scores, p_values = chi2(tfidf_features, encoded_sentiments)

# Select up to the top 1000 features (or adjust based on your needs).
# k is capped at the number of TF-IDF columns actually produced, so the
# selector never asks for more features than exist.
# NOTE(review): tfidf_features is itself limited to 1000 features, so this keeps
# every column, and X_chi2_selected is not used by the model cells below.
n_selected = min(1000, tfidf_features.shape[1])
k_best = SelectKBest(chi2, k=n_selected)
X_chi2_selected = k_best.fit_transform(tfidf_features, encoded_sentiments)


# MULTINOMIAL LOGISTIC REGRESSION

In [None]:
# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, encoded_sentiments, test_size=0.2, random_state=42)


In [None]:
model = LogisticRegression(solver='lbfgs', multi_class='multinomial',max_iter=1000, random_state=42, n_jobs=6)


In [None]:
# Model Training
model.fit(X_train, y_train)

In [None]:
# Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [None]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test,y_pred)


In [None]:
print(accuracy)
print(classification_rep)



In [None]:
print(matrix)

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# Label the ticks with the class names when the matrix covers every encoded
# class; otherwise fall back to seaborn's automatic tick labels.
class_names = list(encoder.classes_) if len(encoder.classes_) == matrix.shape[0] else 'auto'

# Draw the annotated heatmap on the axes created above, with square cells
sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", ax=ax,
            xticklabels=class_names, yticklabels=class_names,
            square=True, linewidths=.5, cbar_kws={"shrink": .75})

# Set labels and title
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')

# Display
plt.show()

# Hybrid Logistic Regression

In [None]:


# Rule-Based Model using SentiWordNet
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags beginning with J, V, N or R map to adjective, verb, noun and adverb
    respectively; every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wn.ADJ),
        ('V', wn.VERB),
        ('N', wn.NOUN),
        ('R', wn.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Default to noun
    return wn.NOUN

def sentiwordnet_predict(text):
    """Classify ``text`` as 'positive', 'negative' or 'neutral' via SentiWordNet.

    Every token is POS-tagged and looked up in WordNet under the mapped part
    of speech. The first synset found contributes its positive score minus its
    negative score to a running total; tokens with no synset are ignored. The
    sign of the total selects the label, and anything else is 'neutral'.
    """
    total = 0.0
    for token, treebank_tag in nltk.pos_tag(nltk.word_tokenize(text)):
        candidates = wn.synsets(token, pos=get_wordnet_pos(treebank_tag))
        if candidates:
            scored = swn.senti_synset(candidates[0].name())
            total += (scored.pos_score() - scored.neg_score())

    # Return sentiment category
    if total > 0:
        return 'positive'
    if total < 0:
        return 'negative'
    return 'neutral'





In [None]:
# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(prepared_df['body'], prepared_df['sentiment_label'], test_size=0.2, random_state=42)

# Vectorize the text data with a dedicated vectorizer fitted on the training
# split only. Previously `vectorizer` was created but never used: the shared
# `tfidf_vectorizer` was re-fitted instead, overwriting the vocabulary it had
# learned on the full corpus.
vectorizer = TfidfVectorizer(stop_words='english',max_features= 1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)




In [None]:
# Train a Logistic Regression model
clf = LogisticRegression(max_iter=1000, multi_class='multinomial', n_jobs = 6)
clf.fit(X_train_vec, y_train)
ml_predictions = clf.predict(X_test_vec)

# Get predictions from SentiWordNet
swn_predictions = [sentiwordnet_predict(text) for text in X_test]



In [None]:
# Since we have 3 classes, we'll use a majority vote function for the final predictions
def majority_vote(ml_pred, swn_pred, tie_break=None):
    """Combine the ML and SentiWordNet predictions for one sample.

    With only two voters a real majority exists only when both agree. On a
    disagreement the vote is tied, and the previous Counter-based version
    always resolved that tie to ``ml_pred`` (the first item counted). That
    rule is kept as the default and made explicit here.

    Args:
        ml_pred: Label predicted by the machine-learning model.
        swn_pred: Label predicted by the SentiWordNet rule-based model.
        tie_break: Optional label to return when the two disagree. ``None``
            (default) keeps the historical behaviour of returning ``ml_pred``.

    Returns:
        The agreed label, or the tie-break label when the predictions differ.
    """
    if ml_pred == swn_pred:
        return ml_pred
    # NOTE(review): with the default tie-break the combined prediction always
    # equals ml_pred, so the rule-based model cannot change the outcome.
    return ml_pred if tie_break is None else tie_break

final_predictions = [majority_vote(ml, swn) for ml, swn in zip(ml_predictions, swn_predictions)]

# Evaluate the model
accuracy = accuracy_score(y_test, final_predictions)
print("Accuracy:", accuracy)

In [None]:
classification_hybrid = classification_report(y_test, final_predictions)
matrix2 = confusion_matrix(y_test,final_predictions)


In [None]:
print(classification_hybrid)
print(matrix2)

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# Label the ticks with the class names when the matrix covers every encoded
# class; otherwise fall back to seaborn's automatic tick labels.
class_names = list(encoder.classes_) if len(encoder.classes_) == matrix2.shape[0] else 'auto'

# Draw the annotated heatmap on the axes created above, with square cells
sns.heatmap(matrix2, annot=True, fmt="d", cmap="Blues", ax=ax,
            xticklabels=class_names, yticklabels=class_names,
            square=True, linewidths=.5, cbar_kws={"shrink": .75})

# Set labels and title
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')

# Display
plt.show()

# Random Forest Classifier

In [None]:



# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, encoded_sentiments, test_size=0.2, random_state=42)





In [None]:
# Model Selection: Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs= 6)



In [None]:
# Model Training
rf_model.fit(X_train, y_train)

# Model Evaluation


In [None]:
y_pred_rf = rf_model.predict(X_test)


In [None]:
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)


In [None]:
print(accuracy_rf)
print(classification_rep_rf)

In [None]:
matrix3 = confusion_matrix(y_test,y_pred_rf)

In [None]:
print(matrix3)

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# Label the ticks with the class names when the matrix covers every encoded
# class; otherwise fall back to seaborn's automatic tick labels.
class_names = list(encoder.classes_) if len(encoder.classes_) == matrix3.shape[0] else 'auto'

# Draw the annotated heatmap on the axes created above, with square cells
sns.heatmap(matrix3, annot=True, fmt="d", cmap="Blues", ax=ax,
            xticklabels=class_names, yticklabels=class_names,
            square=True, linewidths=.5, cbar_kws={"shrink": .75})

# Set labels and title
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')

# Display
plt.show()

# Multinomial Naive Bayes


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Define the model and parameters
mnb = MultinomialNB()
param_grid = {'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0]}





In [None]:
# Create a GridSearchCV object
grid_search_mnb = GridSearchCV(mnb, param_grid, cv=5, n_jobs= 6)
grid_search_mnb.fit(X_train, y_train)

In [None]:
# Best parameters and accuracy
print('Best parameters for MultinomialNB:', grid_search_mnb.best_params_)
print('Best cross-validation score:', grid_search_mnb.best_score_)

In [None]:
# Evaluate on the test set
y_pred_mnb = grid_search_mnb.predict(X_test)
print('Test accuracy:', accuracy_score(y_test, y_pred_mnb))

In [None]:
classification_mnb = classification_report(y_test, y_pred_mnb)
matrix4 = confusion_matrix(y_test,y_pred_mnb)


In [None]:
print(classification_mnb)
print(matrix4)

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# Label the ticks with the class names when the matrix covers every encoded
# class; otherwise fall back to seaborn's automatic tick labels.
class_names = list(encoder.classes_) if len(encoder.classes_) == matrix4.shape[0] else 'auto'

# Draw the annotated heatmap on the axes created above, with square cells
sns.heatmap(matrix4, annot=True, fmt="d", cmap="Blues", ax=ax,
            xticklabels=class_names, yticklabels=class_names,
            square=True, linewidths=.5, cbar_kws={"shrink": .75})

# Set labels and title
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')

# Display
plt.show()

# Support Vector Machine with Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from scipy.stats import reciprocal, uniform

# Search space for the randomized SVC hyperparameter search (fitted on X_train, y_train below).
# NOTE(review): scipy's uniform takes (loc, scale), so uniform(0.1, 10) would
# sample C from [0.1, 10.1] rather than [0.1, 10] — confirm that is intended.
param_distributions = {
    'C': uniform(0.1, 10),
    'gamma': reciprocal(0.01, 1),
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
}



In [None]:
random_search = RandomizedSearchCV(
    SVC(),
    param_distributions=param_distributions,
    n_iter=50,
    cv=5,
    verbose=2,
    n_jobs= 6,  # Parallel processing
    random_state=42
)




In [None]:
random_search.fit(X_train, y_train)

# Print the best parameters found


In [None]:
print(random_search.best_params_)

In [None]:
# Evaluate on the test set
y_pred_svm = random_search.predict(X_test)
print('Test accuracy:', accuracy_score(y_test, y_pred_svm))

# Topic Modelling

In [None]:
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords


# 1. Data Preprocessing
def preprocess(text):
    """Tokenise ``text`` with gensim and filter the tokens for topic modelling.

    Returns the list of tokens produced by ``simple_preprocess`` that are
    longer than three characters and are not in gensim's stop-word set.
    """
    stop_tokens = gensim.parsing.preprocessing.STOPWORDS
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stop_tokens
    ]

processed_docs = prepared_df['body'].map(preprocess)



In [None]:
# 2. Create a Dictionary & Filter Extremes
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)



In [None]:
# 3. Create Bag-of-Words Corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# 4. Train the LDA Model
# random_state seeds the model so repeated runs start from the same state, and
# 42 matches the seed used for the other models in this notebook.
# NOTE(review): with several workers the topics may still vary slightly
# between runs — confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=8,
                                       id2word=dictionary,
                                       passes=10,
                                       workers=2,
                                       random_state=42)

# 5. Display Topics
topics = lda_model.print_topics(-1)
topics
