In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from textblob import TextBlob




: 

In [2]:
# Fetch the NLTK data packages 'punkt' and 'stopwords'; word_tokenize and
# stopwords.words('english') are both called further down in this notebook.
import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Student\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the tweet export into the working DataFrame.
# NOTE(review): absolute, machine-specific path -- the notebook only runs
# where this exact file exists.
csv_path = r"C:\Users\Student\Desktop\JoeBiden.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Preview the last rows of the loaded frame.
df.tail()

In [7]:
# Per-column count of missing entries in the raw frame.
missing_values = df.isna().sum()


In [None]:
# Show the per-column missing-value counts held in missing_values.
print(missing_values)

In [10]:
# Keep only the columns used by the rest of the notebook, then drop every row
# that is missing a value in any of them.
kept_columns = ["content", "date", "likeCount", "retweetCount"]
df = df[kept_columns].dropna()

In [None]:
# Print the trimmed frame to check the column selection.
# NOTE(review): this prints the whole frame; df.head() would be a lighter preview.
print(df)

In [12]:
# English stop-word lookup used by tokenize_and_remove_stopwords; stored as a
# set so membership checks are cheap.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [13]:
def clean_text(text):
    """Normalise raw tweet text ahead of tokenisation.

    Three regex passes run in order:
      1. remove every ``http...`` run up to the next whitespace,
      2. delete each character that is not an ASCII letter, digit or whitespace,
      3. squeeze every whitespace run down to a single space.
    The result is then trimmed at both ends.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    substitutions = (
        (r"http\S+", ""),
        (r"[^A-Za-z0-9\s]", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()



In [14]:
def tokenize_and_remove_stopwords(text):
    """Tokenise ``text`` with NLTK and discard stop words.

    A token is discarded when its lower-cased form is in the module-level
    ``stop_words`` set; surviving tokens keep their original casing and order.

    Args:
        text: the string to tokenise.

    Returns:
        List of the remaining tokens.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

In [15]:
def stem_tokens(tokens):
    """Return the Porter stem of every token, preserving order.

    Args:
        tokens: iterable of word strings.

    Returns:
        List with one stem per input token.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

In [16]:
# Derive the text columns step by step; each column is computed from the
# one before it: content -> cleaned_content -> tokens -> stemmed_tokens.
df['cleaned_content'] = df['content'].map(clean_text)
df['tokens'] = df['cleaned_content'].map(tokenize_and_remove_stopwords)
df['stemmed_tokens'] = df['tokens'].map(stem_tokens)


In [17]:
# Flatten the per-tweet stem lists into one list, then count occurrences.
all_tokens = []
for stems in df['stemmed_tokens']:
    all_tokens.extend(stems)
word_freq = Counter(all_tokens)


In [None]:
# Show the ten most frequent stems together with their counts.
top_ten = word_freq.most_common(10)
print(top_ten)

In [20]:
# The 20 most frequent stems as (stem, count) pairs; plotted in the
# "Top 20 Most Common Words" bar chart further down.
common_words = word_freq.most_common(20)


In [None]:
# Compare raw text, tokens and stems for the first rows.
preview_cols = ['content', 'tokens', 'stemmed_tokens']
print(df[preview_cols].head())

In [None]:
# Compare raw text, tokens and stems for the last rows.
preview_cols = ['content', 'tokens', 'stemmed_tokens']
print(df[preview_cols].tail())

In [23]:
# NOTE(review): this recomputes df['tokens'] from df['cleaned_content'] with the
# same function as the earlier preprocessing cell, so it looks redundant --
# confirm and drop if nothing changed in between.
df['tokens']=df['cleaned_content'].apply(tokenize_and_remove_stopwords)

In [None]:
# Side-by-side look at the raw text and its stop-word-filtered tokens.
content_and_tokens = df[["content", "tokens"]]
print(content_and_tokens.head())

In [26]:
def get_sentiment(text):
    """Label ``text`` by the sign of its TextBlob polarity score.

    Args:
        text: the string to score.

    Returns:
        'Positive' when the polarity is above zero, 'Negative' when it is
        below zero, and 'Neutral' otherwise.
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0:
        return 'Positive'
    if score < 0:
        return 'Negative'
    return 'Neutral'


In [27]:
# Label every tweet from its cleaned text.
df['sentiment'] = df['cleaned_content'].map(get_sentiment)


In [None]:
# Class balance of the derived sentiment labels.
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)


In [None]:
# Bar chart of how many tweets fall into each sentiment label.
ax = sns.countplot(data=df, x='sentiment')
ax.set(title="Sentiment Distribution", xlabel="Sentiment", ylabel="Count")
plt.show()


In [None]:
# Parse the timestamps, count tweets per (calendar day, sentiment), and pivot
# the sentiment labels into columns; day/label pairs with no tweets become 0.
df['date'] = pd.to_datetime(df['date'])
daily_counts = df.groupby([df['date'].dt.date, 'sentiment']).size()
sentiment_over_time = daily_counts.unstack().fillna(0)
sentiment_over_time.plot(figsize=(12,6), title='Sentiment Over Time')


In [None]:
# Mean like count per sentiment label, drawn as a bar chart.
mean_likes = df.groupby('sentiment')['likeCount'].mean()
mean_likes.plot.bar(title='Average Likes by Sentiment')


In [None]:
# Split the (stem, count) pairs into parallel sequences and draw them as bars.
words, counts = zip(*common_words)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(words, counts)
ax.set(title="Top 20 Most Common Words", xlabel="Words", ylabel="Frequency")
plt.xticks(rotation=45)  # slant the labels so neighbouring words do not overlap
fig.tight_layout()
plt.show()

In [51]:
# Re-join each token list into one space-separated string, the input format
# the TF-IDF vectoriser is given below.
X = df['tokens'].map(' '.join)

In [52]:

# Target labels: the sentiment column derived with get_sentiment.
y = df['sentiment']  

In [53]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on y -- consider stratify=y if the
# sentiment classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [54]:
# TF-IDF features capped at 5000 terms. The vectoriser is fitted on the
# training texts only; the test texts are transformed with that fitted
# vectoriser rather than being used to fit it.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [55]:
# Oversample the minority sentiment classes with SMOTE. Only the training
# split is resampled, so the test split is left untouched for evaluation.
# The resampled pair is what the random forest below is fitted on.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_vec, y_train)

In [None]:
# Train a 100-tree random forest (fixed seed) on the balanced training data.
model_rf = RandomForestClassifier(random_state=42, n_estimators=100)
model_rf.fit(X=X_train_bal, y=y_train_bal)

In [57]:
# Predict sentiment labels for the held-out TF-IDF features.
y_pred_rf = model_rf.predict(X_test_vec)

In [None]:
# Overall accuracy followed by the per-class classification report on the
# held-out split.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)