<a href="https://colab.research.google.com/github/abdou19-97/TweetSentement/blob/main/TeslaTweet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install Transformers 
# !pip install scipy
import pandas as pd
import tweepy
from textblob import TextBlob
from wordcloud import wordcloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import matplotlib.pyplot as plt
# Fetch the NLTK data used later in the notebook: the English stop-word list
# and the Punkt tokenizer models that word_tokenize() loads.
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource rather
# than 'punkt', so both are requested. NOTE(review): an identifier unknown to
# the download index is expected to be reported without raising — confirm.
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource)


In [None]:
# Location of the tweets CSV.
# NOTE(review): the path points below /content/drive, so it presumably needs
# Google Drive to be mounted before this cell runs — confirm.
STOCK_TWEETS_CSV = '/content/drive/MyDrive/TeslaStockTweet/stock_tweets.csv'

# Read the file into a DataFrame and preview the first five rows.
df = pd.read_csv(STOCK_TWEETS_CSV)
df.head()

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()


def _normalize_tweet(text):
    """Lowercase a tweet and strip every character that is not a letter, digit or whitespace."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())


# Overwrite the raw text with its normalized form, then split it into word tokens.
df['Tweet'] = df['Tweet'].apply(_normalize_tweet)
df['tokens'] = df['Tweet'].apply(word_tokenize)


In [None]:
# Preview the first five rows to check the normalized text and the new 'tokens' column.
df.head(5)

In [None]:
# Count the tweets for each distinct value of the 'Date' column.
# NOTE(review): if 'Date' holds full timestamps rather than calendar days, this
# yields one group per timestamp — confirm the column's granularity.
tweets_per_date = df.groupby('Date').size()
daily_counts = tweets_per_date.reset_index(name='counts')

In [None]:
# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def _keep_token(word):
    """Return True for tokens longer than two characters that are not stop words."""
    return len(word) > 2 and word not in stop_words


# Drop stop words and very short tokens, then rebuild one cleaned string per tweet.
df['tokens'] = df['tokens'].apply(lambda words: [w for w in words if _keep_token(w)])
df['clean_text'] = df['tokens'].apply(' '.join)


In [None]:
# Preview the first five rows, now including the 'clean_text' column.
df.head(5)

In [None]:
# frac=1 keeps every row, so this shuffles the whole frame rather than taking a
# subset; random_state=1 makes the resulting order reproducible.
df = df.sample(frac=1.0, random_state=1)

# List the columns available at this point.
df.columns

In [None]:
# Add columns for subjectivity and polarity
# Polarity refers to whether the tweet is positive or negative
# Subjectivity refers to personal opinion, feelings, etc.
# Each tweet is analysed once and both values are read from that single result,
# instead of building and scoring a TextBlob twice per tweet.
tweet_sentiments = [TextBlob(text).sentiment for text in df['clean_text']]
df['Subj'] = [sentiment.subjectivity for sentiment in tweet_sentiments]
df['Polar'] = [sentiment.polarity for sentiment in tweet_sentiments]

In [None]:
# Preview the first five rows, now including the 'Subj' and 'Polar' columns.
df.head()

In [None]:
# Plot the word cloud
# Concatenate every cleaned tweet into one string for the generator.
allwords = ' '.join(df['clean_text'])
# Keep the result under its own name: binding it to `wordcloud` would replace the
# imported `wordcloud` module, and re-running this cell would then fail on
# `wordcloud.WordCloud`.
tweet_cloud = wordcloud.WordCloud(width=500, height=300, random_state=21, max_font_size=110).generate(allwords)
plt.imshow(tweet_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Map a polarity score to a sentiment class: -1 negative, 0 neutral, 1 positive
def getAnalysis(score):
    """Return 1 for a score above zero, -1 for a score below zero, otherwise 0."""
    if score > 0:
        return 1
    if score < 0:
        return -1
    # Reached for zero and for any value that compares neither above nor below zero.
    return 0

# Label every tweet by the sign of its polarity and store the label in 'Score'.
df['Score'] = df['Polar'].map(getAnalysis)

# Preview the first ten rows.
df.head(n=10)

In [None]:
from numpy.ma import count
# Disabled: print every positive tweet, one per line.
# (Score holds the integers -1, 0 and 1, so a comparison with the string "1" selects nothing.)
# positive_tweets = df[df.Score == "1"]['clean_text']
# for i, tweet in enumerate(positive_tweets):
#     print(f"{i+1}) {tweet}\n")

In [None]:
# Share of tweets labelled positive (Score == 1), as a percentage with one decimal.
is_positive = df['Score'] == 1
positive_tweets = df.loc[is_positive, 'clean_text']
positive = round(len(positive_tweets) / len(df) * 100, 1)
print(f"Percentage of positive tweets: {positive}%")

In [None]:
# Share of tweets labelled negative (Score == -1), as a percentage with one decimal.
is_negative = df['Score'] == -1
negative_tweets = df.loc[is_negative, 'clean_text']
n_negative = round(len(negative_tweets) / len(df) * 100, 1)
print(f"Percentage of negative tweets: {n_negative}%")

In [None]:
# Share of tweets labelled neutral (Score == 0), as a percentage with one decimal.
neutral_tweets = df[df.Score == 0]['clean_text']
# Store the percentage under its own name so `neutral_tweets` keeps the selected
# tweets, mirroring `positive_tweets`/`positive` and `negative_tweets`/`n_negative`.
n_neutral = round((len(neutral_tweets) / len(df)) * 100, 1)
print(f"Percentage of neutral tweets: {n_neutral}%")

Percentage of neutral tweets: 37.6%


In [None]:
# Visualize how many tweets fall into each sentiment class.
# The counts are computed once and reused for the chart.
score_counts = df['Score'].value_counts()
fig, ax = plt.subplots()
score_counts.plot(kind='bar', ax=ax)
# Title and axis labels are set on the axes after plotting: pandas may assign the
# x label itself (from the index name) while drawing, which would replace a
# label that was set beforehand.
ax.set_title('Sentiment Analysis')
ax.set_xlabel('Sentiment')
ax.set_ylabel('count')
plt.show()

In [None]:
# Convert the date column to a datetime object
# df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

# Ticker symbols, names and generic finance terms to remove from the token
# lists. Entries are compared against single tokens, so a multi-word entry such
# as 'elon musk' can never match on its own; its individual words are therefore
# listed as well (the original entry is kept for compatibility).
contextual_words = [
    'tsla', 'tesla', 'amzn', 'nio', 'elon musk', 'elon', 'musk', 'tsm',
    'stock', 'stocks', 'market', 'trading', 'finance', 'investment',
    'investing', 'portfolio', 'trade', 'shares', 'company', 'companies',
    'investor',
]

In [None]:
# Drop the contextual (ticker / finance) words from a token list
def filter_words(tokens):
    """Return the tokens whose lowercase form is not listed in `contextual_words`.

    The order of the remaining tokens is preserved and a new list is returned.
    """
    kept = []
    for word in tokens:
        if word.lower() in contextual_words:
            continue
        kept.append(word)
    return kept

# Sentiment score of a token list: the sum of the per-token polarities
def get_sentiment_score(tokens):
    """Sum the TextBlob polarity of each token, scoring one token at a time.

    Returns 0 for an empty token list.
    """
    return sum((TextBlob(word).sentiment.polarity for word in tokens), 0)


In [None]:
# Remove the contextual words from every tweet's token list.
df['filtered_tokens'] = df['tokens'].map(filter_words)

# Score each tweet from its remaining tokens.
df['sentiment_score'] = df['filtered_tokens'].map(get_sentiment_score)

# Mean sentiment score for each 'Date' value, as a two-column frame.
mean_score_by_date = df.groupby('Date')['sentiment_score'].mean()
daily_sentiment = mean_score_by_date.reset_index(name='sentiment_score')

# Join the per-date tweet counts with the per-date mean sentiment on 'Date'.
daily_data = daily_counts.merge(daily_sentiment, on='Date')

In [None]:
# Preview the first ten rows, now including 'filtered_tokens' and 'sentiment_score'.
df.head(n=10)

In [None]:
# Share of tweets that are positive once the contextual words are removed.
# The selection uses `sentiment_score` (computed from `filtered_tokens`);
# selecting on `Score`, which was derived from the unfiltered polarity, would
# only repeat the percentage of the unfiltered analysis.
positive_tweets = df[df['sentiment_score'] > 0]['filtered_tokens']
positive = round((len(positive_tweets) / len(df)) * 100, 1)
print(f"Percentage of positive tweets: {positive}%")

In [None]:
# Share of tweets that are negative once the contextual words are removed.
# The selection uses `sentiment_score` (computed from `filtered_tokens`);
# selecting on `Score`, which was derived from the unfiltered polarity, would
# only repeat the percentage of the unfiltered analysis.
negative_tweets = df[df['sentiment_score'] < 0]['filtered_tokens']
n_negative = round((len(negative_tweets) / len(df)) * 100, 1)
print(f"Percentage of negative tweets: {n_negative}%")

In [None]:
# Share of tweets that are neutral once the contextual words are removed.
# The selection uses `sentiment_score` (computed from `filtered_tokens`);
# selecting on `Score`, which was derived from the unfiltered polarity, would
# only repeat the percentage of the unfiltered analysis.
neutral_tweets = df[df['sentiment_score'] == 0]['filtered_tokens']
# The percentage gets its own name so `neutral_tweets` keeps the selected rows.
n_neutral = round((len(neutral_tweets) / len(df)) * 100, 1)
print(f"Percentage of neutral tweets: {n_neutral}%")