In [60]:
import pandas as pd

# Load the cleaned NUS SMS corpus from the working directory.
sms_csv_path = "clean_nus_sms.csv"
df = pd.read_csv(sms_csv_path)

# Preview the first rows, then list column dtypes and non-null counts.
display(df.head())

df.info()


Unnamed: 0.1,Unnamed: 0,id,Message,length,country,Date
0,0,10120,Bugis oso near wat...,21,SG,2003/4
1,1,10121,"Go until jurong point, crazy.. Available only ...",111,SG,2003/4
2,2,10122,I dunno until when... Lets go learn pilates...,46,SG,2003/4
3,3,10123,Den only weekdays got special price... Haiz......,140,SG,2003/4
4,4,10124,Meet after lunch la...,22,SG,2003/4


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48598 entries, 0 to 48597
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  48598 non-null  int64 
 1   id          48598 non-null  int64 
 2   Message     48595 non-null  object
 3   length      48598 non-null  object
 4   country     48598 non-null  object
 5   Date        48598 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.2+ MB


## Objective

Analyse which words are used most often in messages from different countries.

## Data cleaning on message variable

In [61]:
# A few messages are missing (NaN). astype(str) alone would turn them into the
# literal text "nan", which would later be tokenised and counted as a word.
# Replace them with empty strings first so they contribute no tokens.
df['Message'] = df['Message'].fillna('').astype(str)


## Data cleaning on country variable

In [62]:
# Tally how many messages carry each raw country label.
print(df['country'].value_counts())


country
Singapore              22013
SG                      9806
India                   6901
United States           3749
USA                     1932
Sri Lanka               1017
Malaysia                 766
Pakistan                 751
unknown                  602
Canada                   198
Bangladesh               126
China                    107
india                    105
INDIA                     79
Philippines               67
Indonesia                 48
Nepal                     39
srilanka                  30
United Kingdom            30
Hungary                   28
Serbia                    22
Kenya                     20
Ghana                     18
Italia                    10
Turkey                    10
Trinidad and Tobago       10
Lebanon                   10
Slovenia                  10
Nigeria                   10
New Zealand               10
Macedonia                 10
UK                        10
Morocco                    9
Romania                    9
Austra

### Convert 2-letter country codes to full names

In [63]:
from iso3166 import countries

def convert_country_codes(code):
    """Return the ISO 3166 name for ``code``, or ``code`` unchanged if lookup fails.

    Args:
        code: A country label as found in the ``country`` column (for example
            a 2-letter code such as ``"SG"``).

    Returns:
        The ``name`` attribute of the record returned by ``countries.get``.
        If the lookup raises ``KeyError``, ``AttributeError`` or ``TypeError``,
        the input is returned as-is.
    """
    try:
        return countries.get(code).name
    # Only lookup failures fall back to the raw label. A bare ``except`` would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    except (KeyError, AttributeError, TypeError):
        return code

# Run every label through the ISO lookup, then re-check the distribution.
df['country'] = df.country.apply(convert_country_codes)
print(df['country'].value_counts())



country
Singapore                   31819
India                        7085
United States                3749
United States of America     1932
Sri Lanka                    1017
Malaysia                      767
Pakistan                      751
unknown                       602
Canada                        198
Bangladesh                    126
China                         107
Philippines                    67
Indonesia                      48
Nepal                          39
srilanka                       30
United Kingdom                 30
Hungary                        28
Serbia                         22
Kenya                          20
Ghana                          18
Turkey                         10
Trinidad and Tobago            10
Lebanon                        10
Italia                         10
Slovenia                       10
Nigeria                        10
New Zealand                    10
Macedonia                      10
UK                             10
Morocc

### Make capitalisation consistent and merge duplicate country names (USA, UK, Sri Lanka)

In [64]:
# Upper-case every label so variants that differ only by case collapse together.
df['country'] = df['country'].str.upper()
print(df.country.value_counts())

# Fold the remaining aliases into one canonical spelling per country.
country_aliases = {
    'UNITED STATES OF AMERICA': 'UNITED STATES',
    'UK': 'UNITED KINGDOM',
    'SRILANKA': 'SRI LANKA',
}
df['country'] = df['country'].replace(country_aliases)
print(df.country.value_counts())


country
SINGAPORE                   31819
INDIA                        7085
UNITED STATES                3749
UNITED STATES OF AMERICA     1932
SRI LANKA                    1017
MALAYSIA                      767
PAKISTAN                      751
UNKNOWN                       602
CANADA                        198
BANGLADESH                    126
CHINA                         107
PHILIPPINES                    67
INDONESIA                      48
NEPAL                          39
SRILANKA                       30
UNITED KINGDOM                 30
HUNGARY                        28
SERBIA                         22
KENYA                          20
GHANA                          18
TURKEY                         10
TRINIDAD AND TOBAGO            10
LEBANON                        10
ITALIA                         10
SLOVENIA                       10
NIGERIA                        10
NEW ZEALAND                    10
MACEDONIA                      10
UK                             10
MOROCC

## Preprocessing helpers needed for the analysis below

In [65]:
import nltk, re
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Lemmatizer instance shared by preprocess_text.
normalizer = WordNetLemmatizer()
# NLTK's English stop-word list (not referenced by the cells visible here).
stop_words = stopwords.words('english')

def get_part_of_speech(word):
  """Guess the WordNet part-of-speech tag ("n", "v", "a" or "r") for ``word``.

  Counts the word's WordNet synsets per tag and returns the tag with the most
  synsets. Synsets carrying any other tag are ignored. Ties, including the
  case where the word has no synsets at all, go to the earliest tag in the
  order n, v, a, r.
  """
  synset_tags = [synset.pos() for synset in wordnet.synsets(word)]
  # max() keeps the first maximal candidate, which yields the n, v, a, r tie-break.
  return max(("n", "v", "a", "r"), key=synset_tags.count)

def preprocess_text(text):
  """Lower-case, tokenise and lemmatise ``text``; return the lemmas joined by spaces.

  Runs of non-word characters are replaced by a single space before
  tokenising. Each token is lemmatised with the part of speech guessed by
  ``get_part_of_speech``.

  Args:
      text: The raw string to normalise.

  Returns:
      A single string of lemmas separated by spaces, in token order.
  """
  cleaned = re.sub(r'\W+', ' ', text).lower()
  # The lemma depends only on the token, so compute it once per distinct token
  # rather than repeating the WordNet lookups for every occurrence.
  lemma_by_token = {}
  lemmas = []
  for token in word_tokenize(cleaned):
    if token not in lemma_by_token:
      lemma_by_token[token] = normalizer.lemmatize(token, get_part_of_speech(token))
    lemmas.append(lemma_by_token[token])
  return " ".join(lemmas)

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

## Preprocess documents

In [66]:
# preprocess documents
# Concatenate each country's messages into one document per country.
df_country = (
    df.groupby('country')['Message']
    .apply(' '.join)
    .reset_index()
)

# Normalise each country's document; keys follow df_country's row order.
processed_messages = {
    country: preprocess_text(document)
    for country, document in zip(df_country['country'], df_country['Message'])
}

## Create tf-idf scores for each country

In [67]:
# Keep labels and documents in the same order so matrix rows line up with countries.
country_labels = list(processed_messages)
country_documents = [processed_messages[label] for label in country_labels]

# norm=None leaves the TF-IDF rows unnormalised.
vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(country_documents)
feature_names = vectorizer.get_feature_names_out()

# Dense frame: one row per country, one column per vocabulary term.
tfidf_df = pd.DataFrame(
    tfidf_scores.toarray(),
    index=country_labels,
    columns=feature_names,
)


## Analyse the most used words in each country

In [68]:
country_list = ['SINGAPORE', 'UNITED STATES', 'INDIA', 'SRI LANKA']

top_words_df = pd.DataFrame()
for country in country_list:
    country_scores = tfidf_df.loc[country]
    # Express each word's score as a share of the country's total score.
    score_share = country_scores.div(country_scores.sum())
    # Keep the 20 highest-scoring words, best first.
    top_words_df[country] = score_share.sort_values(ascending=False).head(20).index
display(top_words_df)


Unnamed: 0,SINGAPORE,UNITED STATES,INDIA,SRI LANKA
0,haha,you,be,the
1,to,be,to,be
2,be,to,you,bles
3,go,the,in,and
4,the,and,me,you
5,you,it,it,andreu
6,lol,get,the,to
7,can,that,for,may
8,so,do,and,in
9,get,can,ur,bless


## Analyse most unique words in each country

In [70]:
# Per-word totals across all countries do not depend on the country being
# ranked, so compute them once instead of on every loop iteration.
word_totals = tfidf_df.sum()

unique_words_df = pd.DataFrame()
for country in country_list:
    # Share of each word's total TF-IDF score that comes from this country.
    # A word that occurs only in this country scores exactly 1.0, so the order
    # among such words comes from the sort, not from how often they are used.
    unique_word_df = tfidf_df.loc[country].div(word_totals).sort_values(ascending=False).head(20)
    unique_words_df[country] = unique_word_df.index
display(unique_words_df)

Unnamed: 0,SINGAPORE,UNITED STATES,INDIA,SRI LANKA
0,lixian,wahala,ff,inc
1,mocie,gucci,tuan,nalvalthukkal
2,moduleplanning,ross,jas,kurose
3,modify,rossknowing,jataya,kuruwita
4,modfass,gurren,ttoo,commin
5,modern,gumby,jatey,immeasurably
6,moderation,roughtimeline,jatin,garusinghe
7,modem,guitarin,jaun,kyrie
8,moboy,guessthey,jawab,dsdys
9,mng,guessin,jaya,namal
