In [1]:
#First import important libraries

#for data manipulation I am going to import pandas
import pandas as pd

#for natural language processing nltk
import nltk

#for working with strings
import string, re

#for numerical arrays (numpy) and the NLTK corpora (stopword lists, sample tweets dataset)
import numpy as np
from nltk.corpus import stopwords, twitter_samples

In [2]:
# for loading and saving data
import pickle 

In [3]:
#for tweets tokenizing
from nltk.tokenize import TweetTokenizer 

In [4]:
def process_tweet(tweet):
    """
    Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing the raw tweet text
    Output:
        tweets_clean: a list of lower-cased, stemmed tokens with URLs,
        @handles, #hashtags, a leading "RT", a leading "$ticker", English
        stopwords and punctuation tokens removed
    """
    # Strip URLs, @handles and #hashtags anywhere, plus a leading "RT" / "$ticker".
    tweet = re.sub(r'http\S+|www\S+|@\w+|#\w+|^RT|^\$\w+', '', tweet)

    # preserve_case=False lower-cases the tokens, strip_handles drops any
    # remaining usernames and reduce_len shortens long runs of repeated characters.
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    stemmer = nltk.PorterStemmer()
    # Load the stopword list once per call; a set makes the membership test cheap.
    stopwords_english = set(stopwords.words('english'))

    # Single pass: skip stopwords and punctuation tokens, stem everything else.
    # string.punctuation is a str, so this is a substring test: besides single
    # punctuation marks it also skips multi-character tokens that appear
    # contiguously in it (e.g. '()'), while emoticons such as ':)' are kept.
    return [
        stemmer.stem(word)
        for word in tokenizer.tokenize(tweet)
        if word not in stopwords_english and word not in string.punctuation
    ]

In [5]:
def build_freqs(tweets, ys):
    """
    Count how often each processed word occurs together with each sentiment label.

    Input:
        tweets: a list of raw tweet strings
        ys: the sentiment label of each tweet, in the same order as tweets
            (a list or a numpy array, e.g. of shape (m,) or (m, 1))
    Output:
        freqs: a dictionary mapping each (word, label) pair to the number of
        times that word (after process_tweet) appears in tweets with that label
    """
    # np.ravel always yields a flat 1-D sequence, so a single label still
    # becomes a one-element list (np.squeeze would collapse it to a scalar
    # that zip cannot iterate over).
    yslist = np.ravel(ys).tolist()

    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            # First sighting starts at 1, later sightings increment the count.
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [6]:
# Smoke test of build_freqs: four hand-written tweets, each paired with a label in ys.
tweets=['I am happy','I am dissapointed','I am enjoying myself','I am depressed']
ys=[1,0,1,0]
result=build_freqs(tweets, ys)
# Show the resulting {(stemmed word, label): count} dictionary.
result

{('happi', 1): 1, ('dissapoint', 0): 1, ('enjoy', 1): 1, ('depress', 0): 1}

In [7]:
# Fetch the NLTK corpora this notebook reads: the sample tweets and the stopword lists.
# NOTE(review): process_tweet is already called in an earlier cell and reads the
# stopwords corpus, so on a fresh environment this cell must run before that one.
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/4681ad3e-120f-4aec-9162-
[nltk_data]     3e3a66c158f5/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/4681ad3e-120f-4aec-9162-
[nltk_data]     3e3a66c158f5/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Load the tweet texts from the positive and negative files of the twitter_samples corpus.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [9]:
# The first 4000 tweets of each class are used for training; the rest is held out for testing.
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

In [10]:
# Combine both classes into one list per split: positive tweets first, then negative ones.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [11]:
# Label column vectors: 1.0 for each positive tweet followed by 0.0 for each
# negative tweet, i.e. the same positive-then-negative order as train_x / test_x.
train_y = np.concatenate([np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))], axis=0)
test_y = np.concatenate([np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))], axis=0)

In [12]:
# Count (stemmed word, label) occurrences over the whole training set.
freqs = build_freqs(train_x, train_y)

In [13]:
# Sanity check: container type and number of distinct (word, label) pairs.
print("type(freqs) =", type(freqs))
print("len(freqs) =", len(freqs))

type(freqs) = <class 'dict'>
len(freqs) = 10601


In [14]:
# Show one training tweet (index 22 falls in the positive part of train_x)
# next to the token list process_tweet produces for it.
print('This is an example of a positive tweet: \n', train_x[22])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[22]))

This is an example of a positive tweet: 
 @gculloty87 Yeah I suppose she was lol! Chat in a bit just off out x :))

This is an example of the processed version of the tweet: 
 ['yeah', 'suppos', 'lol', 'chat', 'bit', 'x', ':)']


In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1: Turn the tweets into word-count vectors, keeping at most 1000 vocabulary terms.
# The vectorizer is fitted on the raw tweet strings, so any text handed to
# vectorizer.transform afterwards has to be raw (unprocessed) text as well.
vectorizer = CountVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(train_x)
X_test = vectorizer.transform(test_x)

# Step 2: Re-weight the word counts into TF-IDF features (fitted on the training split only)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train)
X_test_tfidf = tfidf_transformer.transform(X_test)

# Step 3: Create the Logistic Regression model
logistic_model = LogisticRegression()

# Step 4: Train the model on the training data (labels flattened from (n, 1) to (n,))
logistic_model.fit(X_train_tfidf, train_y.ravel())

# Step 5: Make predictions on the test data
predictions = logistic_model.predict(X_test_tfidf)

# Step 6: Evaluate the model's performance - Calculate accuracy
# test_y is flattened the same way as train_y so both arguments are 1-D.
accuracy = accuracy_score(test_y.ravel(), predictions)
print("Accuracy:", accuracy)


Accuracy: 0.756


In [16]:
def predict_tweet(tweet, vectorizer, tfidf_transformer, logistic_model):
    """
    Input:
        tweet: a string (the raw tweet text)
        vectorizer: the fitted CountVectorizer used to build the training features
        tfidf_transformer: the fitted TfidfTransformer applied to the word counts
        logistic_model: the trained logistic regression model
    Output:
        y_pred: the predicted probability that the tweet is positive
        (column 1 of predict_proba)
    """
    # The vectorizer was fitted on raw tweet strings, so the tweet must be
    # vectorized in the same raw form. Running it through process_tweet first
    # would hand the vectorizer stems such as 'happi' that do not exist in a
    # vocabulary learned from unstemmed text, silently dropping those words.
    numerical_features = vectorizer.transform([tweet])
    tfidf_features = tfidf_transformer.transform(numerical_features)

    # Column 1 of predict_proba holds the probability of the second class,
    # which is the positive label (1) when the model is trained on 0/1 labels.
    sentiment_probability = logistic_model.predict_proba(tfidf_features)[:, 1]

    # One tweet in, one probability out.
    return sentiment_probability[0]


In [17]:
def classify_sentiment(sentiment_probability, threshold=0.5, neutral_margin=0.0):
    """
    Input:
        sentiment_probability: a single probability value (float) representing
            the sentiment probability of a tweet.
        threshold: the probability that separates positive from negative
            (default 0.5).
        neutral_margin: half-width of the band around threshold that is
            reported as neutral. The default 0.0 means only a probability
            exactly equal to threshold is neutral.
    Output:
        sentiment_label: a string, one of 'Positive sentiment',
            'Neutral sentiment' or 'Negative sentiment'.
    Raises:
        ValueError: if neutral_margin is negative.
    """
    if neutral_margin < 0:
        raise ValueError("neutral_margin must be non-negative")

    # Neutral is checked first so that the band around the threshold wins
    # over the positive/negative split on either side of it.
    if abs(sentiment_probability - threshold) <= neutral_margin:
        return 'Neutral sentiment'
    if sentiment_probability > threshold:
        return 'Positive sentiment'
    return 'Negative sentiment'


In [18]:
# Example tweet to score with the trained pipeline
my_tweet = 'It is so hot today but it is the perfect day for a beach party'

# Step 1: Predict the sentiment probability using the trained model and vectorizer
sentiment_probability = predict_tweet(my_tweet, vectorizer, tfidf_transformer, logistic_model)

# Step 2: Classify the sentiment based on the probability
sentiment_label = classify_sentiment(sentiment_probability)

# Step 3: Print the resulting sentiment label
print("Sentiment Label:", sentiment_label)

Sentiment Label: Positive sentiment
