In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import fnmatch
import string
from urllib.parse import urlparse
import contractions

import nltk
# Fetch every NLTK resource this notebook reads so a fresh environment can
# run it top to bottom. 'stopwords' is needed by the stop-word filter in
# data_preprocessing (stopwords.words('english')); without it that call
# raises LookupError on a machine that has never downloaded the corpus.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('words')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import words, stopwords
from nltk.metrics.distance import jaccard_distance, edit_distance
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nitanshjain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nitanshjain/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/nitanshjain/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [16]:
# Load the tweet dataset from CSV and preview the first rows.
# NOTE(review): the path is an absolute, machine-specific location; the
# notebook will not run elsewhere until this is made configurable.
tweets_df = pd.read_csv('/Users/nitanshjain/Documents/Thapar 4th Sem/Machine Learing/Machine_Learning_Project/data/final_dataset.csv')
tweets_df.head()

Unnamed: 0,user_id,tweet_id,username,location,following,followers,twt_created_at,total_tweets,retweet_count,text,hashtags,mentions
0,2,2,pspatilsbi,Bangalore,325,25,2022-11-08 22:08:44+00:00,2704,0,agenda great old god blees end,[],"[{'screen_name': 'INCIndia', 'name': 'Congress..."
1,3,3,ththegde,"Kandivali East, Mumbai",582,57,2022-11-08 22:00:49+00:00,1969,0,please allow citizen buy forex investment like...,[],"[{'screen_name': 'PMOIndia', 'name': 'PMO Indi..."
2,5,5,rupz_boruah,"Chabua, India",14,33,2022-11-08 21:54:37+00:00,309,0,please take necessary action neurologist amc d...,"[{'text': 'Dr_Dhrubajyoti_Kurmi', 'indices': [...","[{'screen_name': 'MoHFW_INDIA', 'name': 'Minis..."
3,8,8,lazizpizza99,"Jasola Vihar, New Delhi",23,1,2022-11-08 21:28:48+00:00,6,0,sleeping suddenly bed start shaking ignored ke...,"[{'text': 'peace', 'indices': [231, 237]}, {'t...","[{'screen_name': 'LtGovDelhi', 'name': 'LG Del..."
4,11,11,UtkarshMishra_9,"Noida, India",707,1122,2022-11-08 21:14:55+00:00,5764,0,estimated magnitude earthquake affected countr...,"[{'text': 'earthquake', 'indices': [137, 148]}...","[{'screen_name': 'ANI', 'name': 'ANI', 'id': 3..."


In [17]:
# (rows, columns) of the frame as loaded, before any cleaning.
tweets_df.shape

(658, 12)

In [18]:
# Report the shape, keep only the first row for each distinct `text` value,
# then report the shape again so the number of removed rows is visible.
print('Shape of dataset before removal of duplicates is {}'.format(tweets_df.shape))
tweets_df = tweets_df.drop_duplicates(subset=['text'])
print('Shape of dataset after removal of duplicates is {}'.format(tweets_df.shape))

Shape of dataset before removal of duplicates is (658, 12)
Shape of dataset after removal of duplicates is (605, 12)


In [19]:
# Column dtypes of the de-duplicated frame (displayed as the cell output).
tweets_df.dtypes

user_id            int64
tweet_id           int64
username          object
location          object
following          int64
followers          int64
twt_created_at    object
total_tweets       int64
retweet_count      int64
text              object
hashtags          object
mentions          object
dtype: object

In [20]:
# A tweet whose cleaned text has this many tokens or fewer is dropped.
_MAX_SHORT_TWEET_TOKENS = 5


def _is_url(token):
    """Return True when ``token`` parses with a URL scheme (e.g. ``https://...``)."""
    try:
        return bool(urlparse(token).scheme)
    except ValueError:
        # urlparse rejects some malformed tokens outright; keep such a token
        # as ordinary text instead of aborting the whole run.
        return False


def _clean_tweet_text(text, tokenizer, lemmatizer, stop_words):
    """Normalise one raw tweet string and return the cleaned, space-joined text.

    The text is re-tokenized between stages (rather than passing token lists
    along) so that each stage sees exactly what the tokenizer makes of the
    previous stage's joined output.
    """
    # removing links
    text = ' '.join(tok for tok in tokenizer.tokenize(text) if not _is_url(tok))

    # contractions handling ("don't" -> "do not")
    text = ' '.join(contractions.fix(tok) for tok in tokenizer.tokenize(text))

    # adding space between words and punctuations
    for mark in (',', '.', '?', '!'):
        text = text.replace(mark, ' ' + mark)

    # removing hashtags, mentions and bullet-prefixed tokens
    text = ' '.join(
        tok for tok in tokenizer.tokenize(text)
        if tok[0] not in ('#', '@', '▪')
    )

    # removing punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))

    # removing emojis (every run of non-ASCII characters becomes one space)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # lower case
    text = text.lower()

    # remove numbers, then collapse the runs of spaces left behind
    text = re.sub(r'\d+', '', text)
    text = re.sub(' +', ' ', text)

    # removing stopwords
    text = ' '.join(tok for tok in tokenizer.tokenize(text) if tok not in stop_words)

    # lemmatization: each token is lemmatized on its own. A substring
    # replace across the token list would also rewrite unrelated words that
    # merely contain the token (e.g. "us" -> "u" corrupting "user").
    text = ' '.join(lemmatizer.lemmatize(tok) for tok in tokenizer.tokenize(text))

    # removing individual letters (tokens of one or two characters)
    text = ' '.join(tok for tok in tokenizer.tokenize(text) if len(tok) > 2)

    return text


def data_preprocessing(df):
    """Clean the ``text`` column of ``df`` in place and drop very short tweets.

    Each tweet goes through, in order: link removal, contraction expansion,
    hashtag/mention removal, punctuation stripping, non-ASCII removal,
    lower-casing, digit removal, English stop-word removal, WordNet
    lemmatization, and removal of tokens shorter than three characters.
    Rows whose cleaned text has 5 or fewer tokens are then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. Missing values in that column are
        treated as empty text (and are therefore dropped as too short).
        Rows are processed one by one by position, so no ``tweet_id``
        column is required and repeated texts are each handled separately.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: its ``text`` column is
        overwritten and short rows are removed in place.

    Notes
    -----
    The printed messages say "less than 5 words"; the filter actually
    removes tweets with 5 or fewer tokens. The messages are kept unchanged.
    """
    print('Shape of dataset before removal of tweets with less than 5 words is {}'.format(df.shape))

    # Built once per call: constructing these for every tweet, and re-reading
    # the stop-word corpus for every token, dominated the running time.
    tokenizer = TweetTokenizer()
    lemmatizer = WordNetLemmatizer()
    stop_words = frozenset(stopwords.words('english'))

    # Compute all cleaned strings first so the frame is not modified while
    # it is being iterated.
    cleaned_text = [
        _clean_tweet_text('' if pd.isna(raw) else str(raw), tokenizer, lemmatizer, stop_words)
        for raw in df['text']
    ]

    # remove small tweets: collect the index labels of rows that are too short
    short_labels = [
        label
        for label, text in zip(df.index, cleaned_text)
        if len(tokenizer.tokenize(text)) <= _MAX_SHORT_TWEET_TOKENS
    ]

    # updating tweets in dataframe
    df['text'] = cleaned_text
    df.drop(index=short_labels, inplace=True)

    print('Shape of dataset after removal of tweets with less than 5 words is {}'.format(df.shape))

    return df
        
        
        

In [21]:
# Clean the tweet text. data_preprocessing modifies the frame it receives
# and returns that same frame, so re-running this cell re-cleans already
# cleaned text rather than starting from the raw data.
tweets_df = data_preprocessing(tweets_df)

Shape of dataset before removal of tweets with less than 5 words is (605, 12)
Shape of dataset after removal of tweets with less than 5 words is (605, 12)


In [22]:
# Write the preprocessed frame to CSV without the index column.
# NOTE(review): the destination is an absolute, machine-specific path.
filename = '/Users/nitanshjain/Documents/Thapar 4th Sem/Machine Learing/Machine_Learning_Project/data/tweets_preprocessed_1.csv'
tweets_df.to_csv(filename, index=False)