In [1]:
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
# Read the previously cleaned/translated tweets into the working frame `df`
df = pd.read_csv(filepath_or_buffer='cleaned_translation_data_1.csv')

In [3]:
# Peek at the first five rows to check the columns that were loaded
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Gender,Country,Hour,Day,Weekday,IsReshare,Reach,RetweetCount,Likes,Klout,Sentiment,Lang,text,clean_text,translated_text,valid_words
0,0,Unknown,Albania,7,12,Friday,True,339.0,127.0,0.0,44.0,0.0,en,"""RT @AdrianRusso82: Our Innovation Lab is offi...",our innovation lab officially open click learn...,our innovation lab officially open click learn...,"['innovation', 'lab', 'officially', 'open', 'c..."
1,1,Male,Albania,11,7,Thursday,False,87.0,0.0,0.0,22.0,0.0,en,Now Open AWS Asia Pacific (Seoul) Region via ...,now open aws asia pacific seoul region via are...,now open aws asia pacific seoul region via are...,"['open', 'pacific', 'region', 'via']"
2,2,Male,Albania,6,12,Tuesday,False,87.0,0.0,0.0,22.0,0.0,en,A Beginner's Guide to Scaling to 11 Million+ U...,a beginners guide scaling million users amazon...,a beginners guide scaling million users amazon...,"['guide', 'scaling', 'million', 'via']"
3,3,Male,Albania,10,12,Tuesday,False,87.0,0.0,0.0,22.0,0.0,en,Bridging AWS and Azure environments via VPN vi...,bridging aws azure environments via vpn via ar...,bridging aws azure environments via vpn via ar...,"['bridging', 'azure', 'via', 'via']"
4,4,Male,Albania,9,21,Thursday,False,85.0,0.0,0.0,21.0,0.0,en,ELK on AWS ElasticSearch Service + ElasticBean...,elk aws elasticsearch service elasticbeanstalk...,elk aws elasticsearch service elasticbeanstalk...,"['elk', 'service', 'via']"


In [4]:
# Remove the leftover 'Unnamed: 0' column picked up from the CSV
df = df.drop(columns=['Unnamed: 0'])

In [5]:
# Confirm the 'Unnamed: 0' column is gone
df.head(n=5)

Unnamed: 0,Gender,Country,Hour,Day,Weekday,IsReshare,Reach,RetweetCount,Likes,Klout,Sentiment,Lang,text,clean_text,translated_text,valid_words
0,Unknown,Albania,7,12,Friday,True,339.0,127.0,0.0,44.0,0.0,en,"""RT @AdrianRusso82: Our Innovation Lab is offi...",our innovation lab officially open click learn...,our innovation lab officially open click learn...,"['innovation', 'lab', 'officially', 'open', 'c..."
1,Male,Albania,11,7,Thursday,False,87.0,0.0,0.0,22.0,0.0,en,Now Open AWS Asia Pacific (Seoul) Region via ...,now open aws asia pacific seoul region via are...,now open aws asia pacific seoul region via are...,"['open', 'pacific', 'region', 'via']"
2,Male,Albania,6,12,Tuesday,False,87.0,0.0,0.0,22.0,0.0,en,A Beginner's Guide to Scaling to 11 Million+ U...,a beginners guide scaling million users amazon...,a beginners guide scaling million users amazon...,"['guide', 'scaling', 'million', 'via']"
3,Male,Albania,10,12,Tuesday,False,87.0,0.0,0.0,22.0,0.0,en,Bridging AWS and Azure environments via VPN vi...,bridging aws azure environments via vpn via ar...,bridging aws azure environments via vpn via ar...,"['bridging', 'azure', 'via', 'via']"
4,Male,Albania,9,21,Thursday,False,85.0,0.0,0.0,21.0,0.0,en,ELK on AWS ElasticSearch Service + ElasticBean...,elk aws elasticsearch service elasticbeanstalk...,elk aws elasticsearch service elasticbeanstalk...,"['elk', 'service', 'via']"


In [6]:
# Number of rows per language code, most frequent first
df['Lang'].value_counts()

Lang
en    93428
es     1812
fr     1097
_u      925
tr      899
ja      676
de      570
in      427
tl      421
ko      365
pt      239
nl      174
it      148
zh      124
th       81
pl       78
cs       74
ru       57
fi       51
et       42
ro       35
cy       29
ht       29
da       28
sv       28
ar       22
hu       20
no       13
lt        8
lv        6
hi        3
iw        2
sl        2
uk        2
eu        1
km        1
Name: count, dtype: int64

In [7]:
# (rows, columns) before the language filter is applied
df.shape

(101917, 16)

#### At this point we decided to use the `Lang` column to drop the rows that were not translated into English, since most of the rows have been translated into it.

In [8]:
# Language codes whose rows are discarded
substrings_to_remove = ['_u','es', 'ht','in','it','ja','nl','pt','ro','tl','tr','zh']
# Regex alternation of the codes; no longer used by the filter below, but the
# name is kept in case another cell reads it
pattern = '|'.join(substrings_to_remove)
# Match the codes exactly with `isin`. The previous `str.contains(pattern)` did
# regex *substring* matching, so any longer code that merely contained one of
# these strings would also have been dropped, and a NaN in 'Lang' would have
# broken the `~` negation.
# `.copy()` makes the result an independent frame, so adding columns to `df`
# later does not write into a slice of the original.
df = df[~df['Lang'].isin(substrings_to_remove)].copy()

In [9]:
# (rows, columns) after the language filter
df.shape

(96008, 16)

In [12]:
# Fetch every NLTK resource the preprocessing in this notebook relies on:
#   stopwords -> stopwords.words('english')
#   punkt     -> word_tokenize
#   wordnet   -> WordNetLemmatizer (was not downloaded before, so a machine
#                without the corpus failed with LookupError when lemmatizing)
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Built once at definition time instead of on every call: the function is
# applied row by row, and re-reading the stop-word list and re-creating the
# lemmatizer for each row is loop-invariant work.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Tokenize `text`, drop English stop words and lemmatize what remains.

    Parameters
    ----------
    text : str or any scalar
        Text to process. Non-string values are converted with `str()`.
        Missing values (None / NaN / pd.NA) produce an empty token list;
        previously they were stringified and yielded the bogus token 'nan'.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order, stop words removed
        (stop-word matching is case-insensitive).
    """
    if not isinstance(text, str):
        # Missing value: there is nothing to tokenize
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return []
        text = str(text)

    tokens = word_tokenize(text)

    # Filter stop words first, then lemmatize the survivors (same order as before)
    return [
        _LEMMATIZER.lemmatize(word)
        for word in tokens
        if word.lower() not in _STOP_WORDS
    ]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Run every 'clean_text' entry through preprocess_text and keep the resulting
# token lists in a new 'clean_text_processed' column
df['clean_text_processed'] = df['clean_text'].apply(preprocess_text)

In [13]:
# Persist the filtered + tokenized frame; the row index is written as well
df.to_csv("cleaned_translation_data_2.csv", index=True)

## Data Engineering

In [14]:
# Remove the metadata and raw-text columns from the frame
columns_to_drop = [
    'Gender', 'Country', 'Hour', 'Day', 'Weekday', 'IsReshare', 'Reach',
    'RetweetCount', 'Likes', 'Klout', 'Lang', 'text', 'clean_text',
]
df = df.drop(columns=columns_to_drop)

In the `Sentiment` column, the values are not well defined, so we divide them into 3 categories and then apply a mapping.

In [15]:
# Bucket the numeric sentiment score into three labels by its sign:
#   score > 0 -> 'Positive', score == 0 -> 'Neutral', score < 0 -> 'Negative'.
# np.sign leaves NaN as NaN and NaN is not a key of the mapping, so a row with a
# missing score keeps a missing Mood. The previous nested conditional labelled
# such rows 'Negative', because every comparison against NaN is False.
sign_to_mood = {1.0: 'Positive', 0.0: 'Neutral', -1.0: 'Negative'}
df['Mood'] = np.sign(df['Sentiment']).map(sign_to_mood)

In [22]:
# Expose the 'valid_words' column under the name 'tokens'
df = df.rename(columns={'valid_words': 'tokens'})
# Show text, tokens, raw score and derived label side by side for the first 10 rows
df.loc[:, ['translated_text', 'tokens', 'Sentiment', 'Mood']].head(10)

Unnamed: 0,translated_text,tokens,Sentiment,Mood
0,our innovation lab officially open click learn...,"['innovation', 'lab', 'officially', 'open', 'c...",0.0,Neutral
1,now open aws asia pacific seoul region via are...,"['open', 'pacific', 'region', 'via']",0.0,Neutral
2,a beginners guide scaling million users amazon...,"['guide', 'scaling', 'million', 'via']",0.0,Neutral
3,bridging aws azure environments via vpn via ar...,"['bridging', 'azure', 'via', 'via']",0.0,Neutral
4,elk aws elasticsearch service elasticbeanstalk...,"['elk', 'service', 'via']",0.0,Neutral
5,lessons years amazon web services via aresysadmin,"['web', 'via']",0.0,Neutral
6,how powerful burstable aws t instances via are...,"['powerful', 'via']",2.0,Positive
7,microsoft google lead cloud growth aws rules o...,"['lead', 'cloud', 'growth', 'overall', 'azure']",0.0,Neutral
8,how deploy ssl certificate aws certificate man...,"['deploy', 'certificate', 'certificate', 'mana...",0.0,Neutral
9,too bad facebook host parse a w s heroku mongo d,"['bad', 'host', 'parse', 'w']",-3.0,Negative


In [23]:
# Integer code assigned to each Mood label
mood_mapping = dict(Neutral=0, Positive=1, Negative=2)

# Overwrite 'Mood' in place with its integer code (no separate encoded column
# is created). Labels that are not keys of the mapping become NaN, so running
# this cell a second time on the already-encoded column would blank it out.
df['Mood'] = df['Mood'].map(mood_mapping)

In [24]:
# Same 10-row preview as before, now with 'Mood' holding the integer codes
df.loc[:, ['translated_text', 'tokens', 'Sentiment', 'Mood']].head(10)

Unnamed: 0,translated_text,tokens,Sentiment,Mood
0,our innovation lab officially open click learn...,"['innovation', 'lab', 'officially', 'open', 'c...",0.0,0
1,now open aws asia pacific seoul region via are...,"['open', 'pacific', 'region', 'via']",0.0,0
2,a beginners guide scaling million users amazon...,"['guide', 'scaling', 'million', 'via']",0.0,0
3,bridging aws azure environments via vpn via ar...,"['bridging', 'azure', 'via', 'via']",0.0,0
4,elk aws elasticsearch service elasticbeanstalk...,"['elk', 'service', 'via']",0.0,0
5,lessons years amazon web services via aresysadmin,"['web', 'via']",0.0,0
6,how powerful burstable aws t instances via are...,"['powerful', 'via']",2.0,1
7,microsoft google lead cloud growth aws rules o...,"['lead', 'cloud', 'growth', 'overall', 'azure']",0.0,0
8,how deploy ssl certificate aws certificate man...,"['deploy', 'certificate', 'certificate', 'mana...",0.0,0
9,too bad facebook host parse a w s heroku mongo d,"['bad', 'host', 'parse', 'w']",-3.0,2


In [25]:
# Remove the raw 'Sentiment' score now that 'Mood' has been derived from it
df = df.drop(columns='Sentiment')

In [26]:
# Display the final frame (pandas truncates the rendering to the first and last rows)
df

Unnamed: 0,translated_text,tokens,clean_text_processed,Mood
0,our innovation lab officially open click learn...,"['innovation', 'lab', 'officially', 'open', 'c...","[innovation, lab, officially, open, click, lea...",0
1,now open aws asia pacific seoul region via are...,"['open', 'pacific', 'region', 'via']","[open, aws, asia, pacific, seoul, region, via,...",0
2,a beginners guide scaling million users amazon...,"['guide', 'scaling', 'million', 'via']","[beginner, guide, scaling, million, user, amaz...",0
3,bridging aws azure environments via vpn via ar...,"['bridging', 'azure', 'via', 'via']","[bridging, aws, azure, environment, via, vpn, ...",0
4,elk aws elasticsearch service elasticbeanstalk...,"['elk', 'service', 'via']","[elk, aws, elasticsearch, service, elasticbean...",0
...,...,...,...,...
101912,does anyone good workflow using aws data scien...,"['anyone', 'good', 'data', 'science', 'spend',...","[anyone, good, workflow, using, aws, data, sci...",1
101913,aws csa associate professional training availa...,"['associate', 'professional', 'training', 'ava...","[aws, csa, associate, professional, training, ...",0
101914,curious change opinion aws efficient dropbox r...,"['curious', 'change', 'opinion', 'efficient', ...","[curious, change, opinion, aws, efficient, dro...",1
101915,new aws startup blog what startups should know...,"['new', 'know', 'choosing']","[new, aws, startup, blog, startup, know, choos...",0


In [28]:
# (rows, columns) of the final frame
df.shape

(96008, 4)

In [27]:
# Write the final frame to disk, including the row index
df.to_csv(path_or_buf="cleaned_translation_data_3.csv", index=True)