In [1]:
import warnings
warnings.filterwarnings("ignore")
# This code is used to suppress warnings in the console output.
# The `warnings` module is imported, and the `filterwarnings()` function is called with the argument "ignore".
# This sets the warning filter to ignore all warnings, preventing them from being displayed in the console.

# Preprocessing Twitter Data

In [2]:
import nltk
nltk.download('stopwords', quiet = True)

True

In [3]:
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

def preprocess_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    # Remove hashtags and mentions
    tweet = re.sub(r"#\w+|\@\w+", "", tweet)
    
    # Tokenize tweet
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(tweet)
    
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    
    # Remove punctuation and special characters
    filtered_tweet = " ".join(filtered_tokens)
    filtered_tweet = re.sub(r"[^\w\s]", "", filtered_tweet)
    
    return filtered_tweet

# Example usage
tweet = "Excited to try out the new product! #awesome #innovation"
preprocessed_tweet = preprocess_tweet(tweet)
print(preprocessed_tweet)

Excited try new product 


# Lexicon-Based approaches for twitter sentiment analysis 

In [4]:
nltk.download('vader_lexicon', quiet = True)

from nltk.sentiment import SentimentIntensityAnalyzer

def analyze_sentiment(tweet):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(tweet)
    
    # Extract the compound score
    sentiment_score = sentiment_scores["compound"]
    
    if sentiment_score >= 0.05:
        sentiment = "positive"
    elif sentiment_score <= -0.05:
        sentiment = "negative"
    else:
        sentiment = "neutral"
    
    return sentiment

# Example usage
tweet = "I love this new product!"
sentiment = analyze_sentiment(tweet)
print(sentiment)

positive


# Extracting data from twitter

In [5]:
"""
Twitter Authentification Credentials
Please update with your own credentials
"""
cons_key = ''
cons_secret = ''
acc_token = ''
acc_secret = ''

# (1). Authentication Function
def get_twitter_auth():
    """
    @return:
        - the authentication to Twitter
    """
    try:
        consumer_key = cons_key
        consumer_secret = cons_secret
        access_token = acc_token
        access_secret = acc_secret
        
    except KeyError:
        sys.stderr.write("Twitter Environment Variable not Set\n")
        sys.exit(1)
        
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    
    return auth

# (2). Client function to access the authentication API
def get_twitter_client():
    """
    @return:
        - the client to access the authentication API
    """
    auth = get_twitter_auth()
    client = tweepy.API(auth, wait_on_rate_limit=True)
    return client

# (3). Function creating final dataframe
def get_tweets_from_user(twitter_user_name, page_limit=16, count_tweet=200):
    """
    @params:
        - twitter_user_name: the twitter username of a user (company, etc.)
        - page_limit: the total number of pages (max=16)
        - count_tweet: maximum number to be retrieved from a page
        
    @return
        - all the tweets from the user twitter_user_name
    """
    client = get_twitter_client()
    
    all_tweets = []
    
    for page in Cursor(client.user_timeline, 
                        screen_name=twitter_user_name, 
                        count=count_tweet).pages(page_limit):
        for tweet in page:
            parsed_tweet = {}
            parsed_tweet['date'] = tweet.created_at
            parsed_tweet['author'] = tweet.user.name
            parsed_tweet['twitter_name'] = tweet.user.screen_name
            parsed_tweet['text'] = tweet.text
            parsed_tweet['number_of_likes'] = tweet.favorite_count
            parsed_tweet['number_of_retweets'] = tweet.retweet_count
                
            all_tweets.append(parsed_tweet)
    
    # Create dataframe 
    df = pd.DataFrame(all_tweets)
    
    # Remove duplicates if there are any
    df = df.drop_duplicates("text", keep='first')
    
    return df

# Machine Learning for twitter sentiment analysis 

#### Load the dataset

In [6]:
import pandas as pd

# Load the dataset
df = pd.read_csv('twitter_dataset.csv')

#### Preprocessing the data 

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Preprocess the tweets
def preprocess_tweet(tweet):
    # Remove Twitter handles
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)
    
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    
    # Remove special characters and punctuation
    tweet = re.sub(r'\W+', ' ', tweet)
    
    # Tokenization
    tokens = nltk.word_tokenize(tweet)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Convert to lowercase
    tokens = [token.lower() for token in tokens]
    
    # Join the tokens back into a single string
    processed_tweet = ' '.join(tokens)
    
    return processed_tweet

# Convert 'Tweet' column to string
df['Tweet'] = df['Tweet'].astype(str)

# Apply preprocessing to each tweet in the dataset
df['Processed_tweet'] = df['Tweet'].apply(preprocess_tweet)

# Print the preprocessed dataset
print(df['Processed_tweet'])

0                       yo look lit cs go overwatch combo
1       attention executive administrator ever store g...
2       f loving new dlc rhandlerr rhandlerr rhandlerr...
3                                       rainbow6game xbox
4                                  miss battlefield 1 day
                              ...                        
8239    thanks entering grand summoners watch video se...
8240                              agree clearer excellent
8241    worst minute ever locked layup missed open jum...
8242    participating beta weekend absolute best snipi...
8243                first week perfection mtg ignore tags
Name: Processed_tweet, Length: 8244, dtype: object


#### Define X and y

In [8]:
X = df['Processed_tweet']
y = df['Sentiment']

#### Convert sentiment labels to numerical values

In [9]:
label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
y = y.map(label_mapping)

#### Split the data into training and test sets

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [11]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6595,), (1649,), (6595,), (1649,))

#### Create a TF-IDF vectorizer

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

#### Train the LinearSVC model

In [16]:
from sklearn.svm import LinearSVC

model = LinearSVC()
model.fit(X_train_vectorized, y_train)

#### Make predictions on the test set

In [17]:
y_pred = model.predict(X_test_vectorized)

#### Compute the confusion matrix and generate classification report

In [18]:
from sklearn.metrics import confusion_matrix, classification_report

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Martrix: ')
print(cm)

# Generate classification report 
target_names = ['Negative', 'Neutral', 'Positive']
classification_rep = classification_report(y_test, y_pred, target_names = target_names)
print('\nClassification Report: ')
print(classification_rep)

Confusion Martrix: 
[[498  60  83]
 [102 275  92]
 [ 87  75 377]]

Classification Report: 
              precision    recall  f1-score   support

    Negative       0.72      0.78      0.75       641
     Neutral       0.67      0.59      0.63       469
    Positive       0.68      0.70      0.69       539

    accuracy                           0.70      1649
   macro avg       0.69      0.69      0.69      1649
weighted avg       0.70      0.70      0.70      1649

