# Import Modules

In [1]:
import re
import string
from collections import Counter

import emoji
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob
from transformers import pipeline
from wordcloud import WordCloud

# Fetch the NLTK resources this notebook relies on: the stop-word lists
# (read via `stopwords.words`), the Punkt tokenizer models, and the
# averaged-perceptron POS tagger (presumably what `nltk.pos_tag` loads).
# The last call's return value is what the cell displays.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/student/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Import Data

In [2]:
# Load the tweet dataset from `tweets.csv` (path is relative to the
# notebook's working directory).
df = pd.read_csv('tweets.csv')

# Bare trailing expression so the notebook renders the loaded frame.
df

  df = pd.read_csv('tweets.csv')


Unnamed: 0,user_name,text,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,hashtags,source
0,Bohmle,"#GPT4 for FREE. \nNo its not a clickbait, @Qol...",Carkingga,,,,,,,,,
1,,AI enthusiast,2019-07-03 03:44:41+00:00,60.0,349.0,611,False,2023-05-17 18:11:12+00:00,"['GPT4', 'ChatGPT4']",Twitter Web App,,
2,Dan Bruno AI,ChatGPT Thinks These 5 Crypto Coins Will Explo...,"Manchester, NH","The latest in #ChatGPT, #BARD, #Bing, and othe...",2021-05-19 01:19:32+00:00,470.0,157.0,5185,False,2023-05-17 18:11:03+00:00,"['chatgpt', 'AI', 'openAI']",dlvr.it
3,Georgiana Comsa,New: @JWVance's post about 5 #startups (includ...,Palo Alto,"Founder of Silicon Valley PR, award-winning PR...",2008-12-24 09:32:23+00:00,3864.0,1883.0,2415,False,2023-05-17 18:10:25+00:00,"['startups', 'startup50']",Twitter Web App
4,Bitone Great,🚨Get Out!🚨\n💰#Binance Spot💰\n⬇ Recommendation:...,Hong Kong,#ChatGPT (AI) powered Free Trading Signal! \nL...,2022-11-21 04:42:18+00:00,1517.0,506.0,64,False,2023-05-17 18:09:39+00:00,"['Binance', 'Short', 'GHSTUSDT']",rsi1
...,...,...,...,...,...,...,...,...,...,...,...,...
3412806,(I)(AM)(T)(MOYO),"Levels🙏🙏🙏,so happy for the chatGPT team for co...",Satoshi Island,Blockchain Enthusiast || Philanthropist || a S...,2013-12-07 16:38:30+00:00,3419.0,4994.0,16659,False,2022-12-05 17:10:31+00:00,,Twitter for Android
3412807,Green,Iterating back-and-forth with tools like #Chat...,🍁,,2022-12-03 12:49:13+00:00,3.0,33.0,3,False,2022-12-05 17:09:22+00:00,['ChatGPT'],Twitter Web App
3412808,Gabriel Furstenheim,Russel vs ChatGPT. It's also funny that it tak...,,Mathematician and Developer @Amazon. Previousl...,2016-07-09 21:08:52+00:00,80.0,34.0,169,False,2022-12-05 17:09:04+00:00,['ChatGPT'],Twitter for Android
3412809,Devang,Was just wondering is there any difference bet...,United States,"passionate by nature, software developer by pr...",2015-05-19 03:17:06+00:00,15.0,86.0,307,False,2022-12-05 17:08:44+00:00,"['ChatGPT', 'GPT3']",Twitter for Android


# Exploratory Data Analysis

In [3]:
# Quick structural summary of the loaded frame: its dimensions, column
# names, the total number of NaN cells, and distinct values per column.
row_total, column_total = df.shape
missing_total = df.isna().sum().values.sum()

print('Rows              :', row_total)
print('Columns           :', column_total)
print('\nFeatures        :\n', df.columns)
print('\nMissing values  :', missing_total)
print('\nUnique values   :', df.nunique())

Rows              : 3412811
Columns           : 12

Features        :
 Index(['user_name', 'text', 'user_location', 'user_description',
       'user_created', 'user_followers', 'user_friends', 'user_favourites',
       'user_verified', 'date', 'hashtags', 'source'],
      dtype='object')

Missing values  : 32106521

Unique values   : user_name           183646
text                474943
user_location        41866
user_description    187179
user_created        185757
user_followers       43556
user_friends         18652
user_favourites      51271
user_verified           62
date                459640
hashtags            110543
source                1230
dtype: int64


# Remove Duplicate Rows

In [4]:
def remove_duplicate_rows(df):
    """Return a copy of ``df`` with fully duplicated rows removed.

    The first occurrence of each duplicated row is kept, and the result is
    re-indexed 0..n-1 (the old index is discarded). The input frame is not
    modified.
    """
    return df.drop_duplicates().reset_index(drop=True)

# Remove Missing Values

In [5]:
def remove_missing_values(df):
    """Return ``df`` without any row that contains a missing value.

    The surviving rows keep their original index labels (the index is NOT
    reset here), so the result may have gaps in its index. The input frame
    is not modified.
    """
    return df.dropna(axis=0, how='any')

# Normalize/Clean the Text

In [6]:
def normalize_text(text):
    """Lower-case, strip ASCII punctuation from, and demojize every entry.

    ``text`` is an integer-indexable collection of strings (positions
    0..len(text)-1 are read and written). It is updated IN PLACE and the
    same object is returned.

    Steps per entry, in order:
      1. lower-case,
      2. delete every character in ``string.punctuation``,
      3. pass the result through ``emoji.demojize``.

    Because demojize runs last, whatever text it emits is neither
    lower-cased nor punctuation-stripped by this function.
    """
    # The deletion table does not depend on the entry, so build it once
    # instead of on every loop iteration.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    for i in range(len(text)):
        lowered = text[i].lower()
        without_punctuation = lowered.translate(strip_punctuation)
        text[i] = emoji.demojize(without_punctuation)

    return text

# Stop Word Removal

In [7]:
def remove_stopwords(text):
    """Drop English stop words from every entry of ``text``.

    ``text`` is an integer-indexable collection of strings; it is updated
    IN PLACE and the same object is returned. Each entry is split on
    whitespace, words whose lower-cased form is in NLTK's English stop-word
    list are removed, and the remainder is re-joined with single spaces.
    """
    # Load the stop-word list once. Doing this inside the loop re-reads the
    # corpus for every entry, which dominates the runtime on large inputs.
    stop_words = frozenset(stopwords.words('english'))

    for i in range(len(text)):
        kept_words = [word for word in text[i].split() if word.lower() not in stop_words]
        text[i] = ' '.join(kept_words)

    return text

# Stemming

In [8]:
def stemming(text):
    """Replace every entry of ``text`` with its Porter-stemmed form.

    Each entry is tokenized with ``word_tokenize``, every token is stemmed
    with a single shared ``PorterStemmer``, and the stems are joined back
    with single spaces. ``text`` is modified in place and returned.
    """
    porter = PorterStemmer()

    for position in range(len(text)):
        tokens = word_tokenize(text[position])
        text[position] = ' '.join(map(porter.stem, tokens))

    return text

# Remove Links

In [10]:
def remove_links(text):
    """Strip URLs (and then all remaining punctuation) from every entry.

    ``text`` is an integer-indexable collection of strings; it is updated
    IN PLACE and the same object is returned. Per entry, in order:
      1. remove every run starting with ``http`` up to the next whitespace,
      2. remove every run starting with ``www`` up to the next whitespace,
      3. remove every character that is neither a word character nor
         whitespace.

    Requires the ``re`` module to be imported at file level.
    """
    # Compile once; the patterns are identical for every entry.
    http_link = re.compile(r'http\S+')
    www_link = re.compile(r'www\S+')
    non_word = re.compile(r'[^\w\s]')

    for i in range(len(text)):
        cleaned = http_link.sub('', text[i])
        cleaned = www_link.sub('', cleaned)
        text[i] = non_word.sub('', cleaned)

    return text

# Remove Special Characters

In [11]:
def remove_special_characters(text):
    """Keep only ASCII letters, digits and whitespace in every entry.

    ``text`` is an integer-indexable collection of strings; it is updated
    IN PLACE and the same object is returned.
    """
    # NOTE: the character class must be `A-Z`. The range `A-z` also spans
    # the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), which would
    # therefore survive the cleanup. A raw string keeps `\s` a regex escape.
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')

    for i in range(len(text)):
        text[i] = disallowed.sub('', text[i])

    return text

# N Gram Analysis

In [12]:
def n_grams(text, n=2):
    """Collect the word n-grams of every document in ``text``.

    Parameters
    ----------
    text : iterable of str, or a single str
        The documents to process. A bare string is treated as one document.
    n : int, default 2
        Number of consecutive words per n-gram (2 = bigrams).

    Returns
    -------
    list of str
        All n-grams, in document order, each joined with single spaces.
        Documents with fewer than ``n`` words contribute nothing.
        N-grams never span two documents.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')

    # Iterating a bare string would walk its characters, so wrap it.
    documents = [text] if isinstance(text, str) else text

    ngrams_list = []
    for document in documents:
        # split() without arguments ignores repeated/leading/trailing
        # whitespace, so no empty "words" end up inside an n-gram.
        words = document.split()
        for start in range(len(words) - n + 1):
            ngrams_list.append(' '.join(words[start:start + n]))

    return ngrams_list

def n_gram_count(ngram):
    """Count how often each item occurs in ``ngram``.

    Returns a plain ``dict`` mapping each distinct item to its number of
    occurrences, with keys in first-seen order. Uses a single pass
    (``collections.Counter``) rather than calling ``.count`` once per item,
    which is quadratic in the input length. Any iterable of hashable items
    is accepted.
    """
    return dict(Counter(ngram))

# Word Cloud Generation

In [13]:
def wordcloud(text):
    """Render a word cloud of all documents in ``text`` and save it to disk.

    ``text`` is an iterable of strings. All documents are concatenated
    (space-separated) into one corpus, a word cloud is generated from it,
    drawn on a new 16x13 matplotlib figure titled 'Most discussed terms',
    and written to 'Word_Cloud_2020041.jpg' in the working directory.
    Returns None.
    """
    # Join the documents directly. Collecting `doc.split(' ')` results
    # yields a list of lists, which `" ".join(...)` rejects with TypeError.
    corpus = " ".join(text)

    plt.figure(figsize=(16,13))
    wc = WordCloud(background_color='white', colormap='Set2', max_words=1000,
                   max_font_size=200, width=1600, height=800)
    wc.generate(corpus)
    plt.title('Most discussed terms', fontsize=20)
    # Fixed random_state keeps the recolouring reproducible between runs.
    plt.imshow(wc.recolor(colormap='Set2', random_state=17), alpha=0.98, interpolation='bilinear')
    plt.axis('off')
    plt.savefig('Word_Cloud_2020041.jpg')

# Sentiment Analysis with TextBlob and RoBERTa

In [14]:
def textblob(text):
    """Return the TextBlob sentiment polarity of each document in ``text``.

    ``text`` is an iterable of strings. The result is a list with one
    polarity value per document, in input order.
    """
    # Each document is scored on its own. Passing the whole collection to
    # TextBlob (instead of the current item) does not score per document.
    return [TextBlob(document).sentiment.polarity for document in text]

def roberta(text):
    """Classify each document in ``text`` with the SiEBERT RoBERTa model.

    A fresh "sentiment-analysis" pipeline is created on every call, then
    invoked once per document. Returns a list holding the pipeline's raw
    result for each document, in input order.
    """
    classifier = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")
    return [classifier(document) for document in text]

In [15]:
# Smoke test: build the RoBERTa sentiment pipeline (downloads the model on
# first use) and print its prediction for one obviously positive sentence.
sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")
print(sentiment_analysis('I love you'))

Downloading:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Metal device set to: Apple M1


2023-05-18 15:54:11.326691: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-05-18 15:54:11.327157: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at siebert/sentiment-roberta-large-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Downloading:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.998561680316925}]


# Feature Engineering

In [16]:
# Calculate length of sentences

def get_sentence_length(text):
    """Map each position in ``text`` to the length of the entry there.

    Keys are the positions as strings ('0', '1', ...); values are
    ``len(entry)``, i.e. the number of characters, not words.
    """
    return {str(position): len(text[position]) for position in range(len(text))}

def word_count(text):
    """Map each position in ``text`` to the entry's space-separated token count.

    Keys are the positions as strings. Tokens come from ``split(' ')``, so
    an empty entry counts as 1 and consecutive spaces produce empty tokens
    that are counted as well.
    """
    positions = range(len(text))
    token_totals = (len(text[position].split(' ')) for position in positions)
    return dict(zip(map(str, positions), token_totals))

def space_count(text):
    """Map each position in ``text`` to the number of ' ' characters there.

    Keys are the positions as strings ('0', '1', ...). Only the plain space
    character is counted, not tabs or newlines.
    """
    return dict((str(position), text[position].count(' ')) for position in range(len(text)))

def verb_count(text):
    """Count part-of-speech tags for each entry of ``text``.

    Despite the name, this tallies EVERY tag returned by ``nltk.pos_tag``,
    not only verb tags. Callers interested in verbs must pick the relevant
    tags out of the per-entry counter themselves.

    Returns a dict keyed by the entry's position as a string ('0', '1',
    ...); each value is a ``collections.Counter`` mapping tag -> number of
    tokens carrying that tag. Requires ``Counter`` to be imported at file
    level.
    """
    pos_tags_count = {}

    for i in range(len(text)):
        # Same tokenizer entry point as `stemming` uses.
        tagged_tokens = nltk.pos_tag(word_tokenize(text[i]))
        pos_tags_count[str(i)] = Counter(tag for _, tag in tagged_tokens)

    return pos_tags_count

def count_words_with_A_start(text):
    """Count, per entry, the space-separated words starting with capital 'A'.

    Returns a dict keyed by the entry's position as a string ('0', '1',
    ...). The match is case-sensitive: words starting with lower-case 'a'
    are not counted, so this yields all zeros on lower-cased text.
    """
    counts = {}

    for i in range(len(text)):
        words = text[i].split(' ')
        # startswith() is safe on the empty tokens that split(' ') yields
        # for empty entries or repeated spaces; indexing word[0] is not.
        counts[str(i)] = sum(1 for word in words if word.startswith('A'))

    return counts

# Vectorization

In [None]:
def count_vectorizer(text):
    """Fit a default ``CountVectorizer`` on ``text`` and return the result.

    Returns whatever ``CountVectorizer().fit_transform(text)`` produces.
    The fitted vectorizer itself is discarded, so its learned vocabulary is
    not available to the caller.
    """
    count_vect = CountVectorizer()

    # Use the vectorizer created above (there is no `vectorizer` name).
    return count_vect.fit_transform(text)

def 