In [1]:
import pandas as pd
import numpy as np
from googletrans import Translator
from langdetect import detect, detect_langs
import string, re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import MWETokenizer, word_tokenize
from nltk.tag import pos_tag
from nltk.stem.lancaster import LancasterStemmer

df = pd.read_csv('bumble_google_play_reviews.csv')

df.head()

[nltk_data] Downloading package punkt to /Users/bsameera/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bsameera/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
0,gp:AOqpTOERZcLdCzs_J1Kf7wjtzeBsNdAbZQeQJMvCBdu...,Shailesh Pandey,https://play-lh.googleusercontent.com/a-/AOh14...,"After being a premium user, I'm not able to lo...",1,0,,2022-05-30 04:46:17,,
1,gp:AOqpTOH3iL1OJxK7ecSv2ZlYd2dyFnUPU65lKIeUNiA...,Arsh Entertainment,https://play-lh.googleusercontent.com/a-/AOh14...,superb,1,0,5.271.1,2022-05-30 04:27:59,,
2,gp:AOqpTOHTyaaVi1rehG_MgWocsfRs05MfB4Umdm3C2MT...,Dil,https://play-lh.googleusercontent.com/a-/AOh14...,"Fraudulent App, If you install a basic version...",1,0,5.271.1,2022-05-30 03:50:07,,
3,gp:AOqpTOGAK1FKDatr5sAKsuaq_KyZmqe8JowKF-odD6i...,Robert Whorton,https://play-lh.googleusercontent.com/a/AATXAJ...,"It's a lot better than Hinge, but it's still n...",3,0,5.270.1,2022-05-30 03:20:46,,
4,gp:AOqpTOEoJejC-2H4kzHOcCOfqJIewOvgdhEA9q9YPkT...,Optimum,https://play-lh.googleusercontent.com/a-/AOh14...,"good app, thanks dear women, you are beautiful",5,0,5.271.1,2022-05-30 03:12:53,Thank you for your review.\nWe have acknowledg...,2022-05-30 04:01:59


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110031 entries, 0 to 110030
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   reviewId              110031 non-null  object
 1   userName              110031 non-null  object
 2   userImage             110031 non-null  object
 3   content               110026 non-null  object
 4   score                 110031 non-null  int64 
 5   thumbsUpCount         110031 non-null  int64 
 6   reviewCreatedVersion  92476 non-null   object
 7   at                    110031 non-null  object
 8   replyContent          64919 non-null   object
 9   repliedAt             64919 non-null   object
dtypes: int64(2), object(8)
memory usage: 8.4+ MB


In [3]:
df.content[0:10]

0    After being a premium user, I'm not able to lo...
1                                               superb
2    Fraudulent App, If you install a basic version...
3    It's a lot better than Hinge, but it's still n...
4       good app, thanks dear women, you are beautiful
5                                                   👍🌹
6    good app, nice to use , feels alot better usin...
7    I cancelled my subscription and deleted my acc...
8    HEY BUMBLE.. YOUR RATING WAS AT 2.8 JUST A FEW...
9    You get about 8 likes a day, if you have to de...
Name: content, dtype: object

In [4]:
# check the type for emoji content

print(df.content[5])
print(type(df.content[5]))

👍🌹
<class 'str'>


In [5]:
df.content.isnull().sum()

5

In [6]:
# drop only the rows where df.content==np.nan

df.dropna(subset=['content'], inplace=True)

In [7]:
df.content.isnull().sum()

0

In [8]:
# only five rows have been dropped from the total 110031

df.shape

(110026, 10)

In [9]:
# check if there are any numeric values

df[df['content'].astype(str).str.isnumeric()==True]

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
9335,gp:AOqpTOHwo8yuiamorlukgN1O9hCXBKT2heA8bdYxPbf...,Blake Piotter,https://play-lh.googleusercontent.com/a/AATXAJ...,0⁰,1,1,5.252.0,2022-02-15 23:38:46,,


In [10]:
print(df.loc[9335, 'content'])
print(type(df.loc[9335, 'content']))

0⁰
<class 'str'>


In [11]:
df = df[df.index!=9335]

In [12]:
df.shape

(110025, 10)

In [13]:
df[df['content'].astype(str).str.isnumeric()==True]

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt


In [14]:
# df["content"] = df["content"].map(lambda x: x.strip())

# error - AttributeError: 'float' object has no attribute 'strip'
# still some numeric values are present

In [15]:
df["content"] = df["content"].apply(lambda x: str(x))

# np.nan converts to str type, remove null values before applying the above method

In [16]:
df["content"] = df["content"].map(lambda x: x.strip())

df["content"] = df["content"].map(lambda x: x.lower())

# remove punctuation
df["content"] = df["content"].map(lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x))

# remove numbers
df["content"] = df["content"].map(lambda x: re.sub('[~^0-9]', '', x))


In [17]:
# check for blank values or spaces ' '
len(df[df.content==''])

0

In [18]:
len(df[df.content=='nan'])        

0

In [19]:
# create multiple columns

# create two new columns 'translated_reviews' - translated to english if non-english 
# and 'english' - 1 if original content was in english otherwise 0 - to know how many reviews were non-english originally


In [20]:
# translator = Translator()

# def create_columns(row):
#     english = 1
    
#     language = translator.detect(row['content'])
#     origin_language = language.lang
#     confidence = language.confidence
#     translated_text = row['content']               # if english, do not translate
    
#     if origin_language!='en' or confidence<1:
#         english = 0
#         result = translator.translate(row['content'], dest='en')
#         translated_text = result.text
                
#     return english, translated_text

In [21]:
# df['english'], df['translated_reviews'] = zip(*df.apply(create_columns, axis=1))

In [22]:
def create_column(row):
    
    try:
        if detect(row['content'])=='en':
            return 1
        else:
            return 0
    except:
        return 0

In [23]:
# df['english'] = df.apply(create_columns, axis=1)

# https://stackoverflow.com/questions/65279841/googletrans-throws-a-connecttimeout-error-the-handshake-operation-timed-out
# https://cloud.google.com/translate/docs/reference/libraries/v2/python


In [24]:
df['english'] = df.apply(create_column, axis=1)

In [25]:
df.head(10)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,english
0,gp:AOqpTOERZcLdCzs_J1Kf7wjtzeBsNdAbZQeQJMvCBdu...,Shailesh Pandey,https://play-lh.googleusercontent.com/a-/AOh14...,after being a premium user i m not able to lo...,1,0,,2022-05-30 04:46:17,,,1
1,gp:AOqpTOH3iL1OJxK7ecSv2ZlYd2dyFnUPU65lKIeUNiA...,Arsh Entertainment,https://play-lh.googleusercontent.com/a-/AOh14...,superb,1,0,5.271.1,2022-05-30 04:27:59,,,0
2,gp:AOqpTOHTyaaVi1rehG_MgWocsfRs05MfB4Umdm3C2MT...,Dil,https://play-lh.googleusercontent.com/a-/AOh14...,fraudulent app if you install a basic version...,1,0,5.271.1,2022-05-30 03:50:07,,,1
3,gp:AOqpTOGAK1FKDatr5sAKsuaq_KyZmqe8JowKF-odD6i...,Robert Whorton,https://play-lh.googleusercontent.com/a/AATXAJ...,it s a lot better than hinge but it s still n...,3,0,5.270.1,2022-05-30 03:20:46,,,1
4,gp:AOqpTOEoJejC-2H4kzHOcCOfqJIewOvgdhEA9q9YPkT...,Optimum,https://play-lh.googleusercontent.com/a-/AOh14...,good app thanks dear women you are beautiful,5,0,5.271.1,2022-05-30 03:12:53,Thank you for your review.\nWe have acknowledg...,2022-05-30 04:01:59,1
5,gp:AOqpTOETmHvZW1r4kz9_d3cyuMlB4mC6MmFNYqyEGDX...,Some One,https://play-lh.googleusercontent.com/a-/AOh14...,👍🌹,5,0,5.271.1,2022-05-30 02:58:52,,,0
6,gp:AOqpTOH_BjWzbX0ifMvfpxPezKTmJ7MYIVXL-E8r6B5...,Trent Drummond,https://play-lh.googleusercontent.com/a/AATXAJ...,good app nice to use feels alot better usin...,5,0,5.271.1,2022-05-30 02:42:02,,,1
7,gp:AOqpTOFvYXtWhExrIyb6PRqrioFEeOXWR74I74lz5QV...,William Alex Pleasant,https://play-lh.googleusercontent.com/a/AATXAJ...,i cancelled my subscription and deleted my acc...,1,0,5.267.0,2022-05-30 02:25:42,,,1
8,gp:AOqpTOHYNDnH4KhtozI9zC40xSt23RTbwcudYbfQHcn...,Pass Kall,https://play-lh.googleusercontent.com/a-/AOh14...,hey bumble your rating was at just a few m...,1,3,5.256.1,2022-05-30 02:13:43,Thank you for your review.\nWe have acknowledg...,2022-05-30 02:38:28,1
9,gp:AOqpTOE15-Bi0oeDfTTYFdqh3BI9AM1fVC-twkcP1Ia...,Chris Whitson,https://play-lh.googleusercontent.com/a-/AOh14...,you get about likes a day if you have to dea...,1,1,5.270.1,2022-05-30 01:22:11,,,1


In [26]:
df.english.value_counts()

1    89472
0    20553
Name: english, dtype: int64

In [27]:
df_english = df[df.english!=0]
df_english.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89472 entries, 0 to 110030
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              89472 non-null  object
 1   userName              89472 non-null  object
 2   userImage             89472 non-null  object
 3   content               89472 non-null  object
 4   score                 89472 non-null  int64 
 5   thumbsUpCount         89472 non-null  int64 
 6   reviewCreatedVersion  74697 non-null  object
 7   at                    89472 non-null  object
 8   replyContent          58213 non-null  object
 9   repliedAt             58213 non-null  object
 10  english               89472 non-null  int64 
dtypes: int64(3), object(8)
memory usage: 8.2+ MB


In [28]:
df_english.reset_index(drop=True, inplace=True)

In [29]:
# save the df_english as a pickle -run only once as running again will replace the current one

# import pickle
# with open('pickles/df_english.pickle', 'wb') as to_write:
#     pickle.dump(df_english, to_write)

In [30]:
# whenever the jupyter notebook is opened, run from here
# read the pickle into a new dataframe
import pickle
with open('pickles/df_english.pickle','rb') as read_file:
    new_df_english = pickle.load(read_file)

In [31]:
new_df_english.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,english
0,gp:AOqpTOERZcLdCzs_J1Kf7wjtzeBsNdAbZQeQJMvCBdu...,Shailesh Pandey,https://play-lh.googleusercontent.com/a-/AOh14...,after being a premium user i m not able to lo...,1,0,,2022-05-30 04:46:17,,,1
1,gp:AOqpTOHTyaaVi1rehG_MgWocsfRs05MfB4Umdm3C2MT...,Dil,https://play-lh.googleusercontent.com/a-/AOh14...,fraudulent app if you install a basic version...,1,0,5.271.1,2022-05-30 03:50:07,,,1
2,gp:AOqpTOGAK1FKDatr5sAKsuaq_KyZmqe8JowKF-odD6i...,Robert Whorton,https://play-lh.googleusercontent.com/a/AATXAJ...,it s a lot better than hinge but it s still n...,3,0,5.270.1,2022-05-30 03:20:46,,,1
3,gp:AOqpTOEoJejC-2H4kzHOcCOfqJIewOvgdhEA9q9YPkT...,Optimum,https://play-lh.googleusercontent.com/a-/AOh14...,good app thanks dear women you are beautiful,5,0,5.271.1,2022-05-30 03:12:53,Thank you for your review.\nWe have acknowledg...,2022-05-30 04:01:59,1
4,gp:AOqpTOH_BjWzbX0ifMvfpxPezKTmJ7MYIVXL-E8r6B5...,Trent Drummond,https://play-lh.googleusercontent.com/a/AATXAJ...,good app nice to use feels alot better usin...,5,0,5.271.1,2022-05-30 02:42:02,,,1


In [32]:
df_non_english = df[df.english==0]

In [33]:
# save the df_non_english as a pickle-run only once as running again will replace the current one

# with open('pickles/df_non_english.pickle', 'wb') as to_write:
#     pickle.dump(df_non_english, to_write)

In [34]:
df_non_english.shape

(20553, 11)

In [35]:
# define a function to translate non-english sentences to english sentences in df_non_english dataframe, only if the 
# language.confidence is more than 0.9 and then append df_non_english to df_english, 
# before that delete the rows with non-english language
# df.append(df2, ignore_index=True)  ----  append dataframe to another dataframe
# df_english.reset_index(drop=True, inplace=True)  -------   reset index of the dataframe
# save the dataframe in a pickle 

df_non_english[0:10]

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,english
1,gp:AOqpTOH3iL1OJxK7ecSv2ZlYd2dyFnUPU65lKIeUNiA...,Arsh Entertainment,https://play-lh.googleusercontent.com/a-/AOh14...,superb,1,0,5.271.1,2022-05-30 04:27:59,,,0
5,gp:AOqpTOETmHvZW1r4kz9_d3cyuMlB4mC6MmFNYqyEGDX...,Some One,https://play-lh.googleusercontent.com/a-/AOh14...,👍🌹,5,0,5.271.1,2022-05-30 02:58:52,,,0
22,gp:AOqpTOE9qIFdB3dHT-ORuDhmHnbgnYCNRrgFxDxXdh4...,Allex Parish,https://play-lh.googleusercontent.com/a-/AOh14...,nyc app,4,0,5.271.1,2022-05-29 18:25:40,,,0
26,gp:AOqpTOHK4Z8ujq85qwKi4qMCCe4yBqwgKK-HvKlMwdA...,Jessie Maglinao,https://play-lh.googleusercontent.com/a-/AOh14...,nice,5,0,,2022-05-29 17:26:58,,,0
31,gp:AOqpTOEu77u81lv8-Q3iyfqLWd153K4_SO856SgJ3VL...,Kenji Kido,https://play-lh.googleusercontent.com/a/AATXAJ...,facil de usar y más confiable que otras,5,0,5.270.1,2022-05-29 16:24:44,,,0
35,gp:AOqpTOE-vSbY0dHAgI_j_X4w7Zw9VlUFiCle4QX-9LX...,George Triton,https://play-lh.googleusercontent.com/a-/AOh14...,awesome 👍,5,0,,2022-05-29 15:17:04,,,0
39,gp:AOqpTOHNc3jWSPToA8h9M8AWOtLoE81Bd3mPn0cEqRN...,Md Shahnawaz afsar,https://play-lh.googleusercontent.com/a/AATXAJ...,wow,5,0,5.271.1,2022-05-29 14:39:35,,,0
43,gp:AOqpTOEUxfZzGVV_mke7X17jhTlmsvktKbPdnZPFpjk...,SOYEL DAS,https://play-lh.googleusercontent.com/a-/AOh14...,not matching anybady,1,0,5.267.0,2022-05-29 12:24:57,,,0
46,gp:AOqpTOHi1Npjgb5uPrglFfkS2uBX-v5UE_JVDC8nook...,Zizou Zinedine,https://play-lh.googleusercontent.com/a/AATXAJ...,ok,4,0,5.271.1,2022-05-29 11:44:31,,,0
47,gp:AOqpTOGyEX0VS9vl80JHwBxblFDc-NxMSvxm3Q3kz2w...,MR sejwal gamer,https://play-lh.googleusercontent.com/a-/AOh14...,kuch nhi hai bass utho khana khao yha aake swi...,5,0,5.271.1,2022-05-29 10:49:36,,,0


In [36]:
detect('nyc app')
detect('nice')

'pl'

In [38]:
translator = Translator()

result = translator.translate('nyc app', dest='en')
language = translator.detect('nyc app')
print(language, result.text)

result = translator.translate('nice', dest='en')
language = translator.detect('nice')
print(language, result.text)

result = translator.translate('👍🌹', dest='en')
language = translator.detect('👍🌹')
print(language, result.text)

result = translator.translate('awesome 👍', dest='en')
language = translator.detect('awesome 👍')
print(language, result.text)

result = translator.translate('not matching anybady', dest='en')
language = translator.detect('not matching anybady')
print(language, result.text)

result = translator.translate(df_non_english.loc[47, 'content'], dest='en')
language = translator.detect(df_non_english.loc[47, 'content'])
print(language, result.text)

result = translator.translate('ok', dest='en')
language = translator.detect('ok')
print(language, result.text)


Detected(lang=en, confidence=0.816269) nyc app
Detected(lang=en, confidence=1) nice
Detected(lang=en, confidence=0) 👍🌹
Detected(lang=en, confidence=1) awesome 👍
Detected(lang=en, confidence=1) not matching anybody
Detected(lang=hi, confidence=1) kuch nhi hai bass utho khana khao yha aake swipe shuru karo phir sojao
Detected(lang=en, confidence=0.9121887) ok


In [None]:
def translate_to_english(row):
    
    translator = Translator()
    result = translator.translate(row['content'], dest='en')
    language = translator.detect(row['content'])

    if language.lang=='en' and language.confidence>0.9:
        row['content'] = result.text
        row['english'] = 1

In [None]:
# df_non_english = df_non_english.apply(translate_to_english, axis=1)

# ConnectTimeout: timed out ------  error

# try with spacy-langdetect

In [39]:
df_non_english.english.value_counts()

0    20553
Name: english, dtype: int64

In [40]:
new_df_english.shape

(89506, 11)

In [41]:
new_df_english.head(10)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,english
0,gp:AOqpTOERZcLdCzs_J1Kf7wjtzeBsNdAbZQeQJMvCBdu...,Shailesh Pandey,https://play-lh.googleusercontent.com/a-/AOh14...,after being a premium user i m not able to lo...,1,0,,2022-05-30 04:46:17,,,1
1,gp:AOqpTOHTyaaVi1rehG_MgWocsfRs05MfB4Umdm3C2MT...,Dil,https://play-lh.googleusercontent.com/a-/AOh14...,fraudulent app if you install a basic version...,1,0,5.271.1,2022-05-30 03:50:07,,,1
2,gp:AOqpTOGAK1FKDatr5sAKsuaq_KyZmqe8JowKF-odD6i...,Robert Whorton,https://play-lh.googleusercontent.com/a/AATXAJ...,it s a lot better than hinge but it s still n...,3,0,5.270.1,2022-05-30 03:20:46,,,1
3,gp:AOqpTOEoJejC-2H4kzHOcCOfqJIewOvgdhEA9q9YPkT...,Optimum,https://play-lh.googleusercontent.com/a-/AOh14...,good app thanks dear women you are beautiful,5,0,5.271.1,2022-05-30 03:12:53,Thank you for your review.\nWe have acknowledg...,2022-05-30 04:01:59,1
4,gp:AOqpTOH_BjWzbX0ifMvfpxPezKTmJ7MYIVXL-E8r6B5...,Trent Drummond,https://play-lh.googleusercontent.com/a/AATXAJ...,good app nice to use feels alot better usin...,5,0,5.271.1,2022-05-30 02:42:02,,,1
5,gp:AOqpTOFvYXtWhExrIyb6PRqrioFEeOXWR74I74lz5QV...,William Alex Pleasant,https://play-lh.googleusercontent.com/a/AATXAJ...,i cancelled my subscription and deleted my acc...,1,0,5.267.0,2022-05-30 02:25:42,,,1
6,gp:AOqpTOHYNDnH4KhtozI9zC40xSt23RTbwcudYbfQHcn...,Pass Kall,https://play-lh.googleusercontent.com/a-/AOh14...,hey bumble your rating was at just a few m...,1,3,5.256.1,2022-05-30 02:13:43,Thank you for your review.\nWe have acknowledg...,2022-05-30 02:38:28,1
7,gp:AOqpTOE15-Bi0oeDfTTYFdqh3BI9AM1fVC-twkcP1Ia...,Chris Whitson,https://play-lh.googleusercontent.com/a-/AOh14...,you get about likes a day if you have to dea...,1,1,5.270.1,2022-05-30 01:22:11,,,1
8,gp:AOqpTOHHpluX_q_pCIEAAqx28vheXLvi7rz7KQRioCx...,Rajat Pratap Singh,https://play-lh.googleusercontent.com/a-/AOh14...,not worth it just better than few other datin...,2,0,5.271.1,2022-05-30 01:02:10,,,1
9,gp:AOqpTOH7l7dai6uJ4HYg3HswUFwJE_1BIkWy11HLKtH...,Brandon Stirling,https://play-lh.googleusercontent.com/a/AATXAJ...,ill give a better rating when it doesn t ask i...,1,0,5.271.1,2022-05-30 00:47:51,,,1


In [42]:
print("HEY BUMBLE.. YOUR RATING WAS AT 2.8 JUST A FEW MONTHS AGO? YET SOMEHOW NOW IT'S 3.8. ITS PRETTY OBVIOUS THAT YOU ARE GIVING YOURSELF FAKE 5 AND 4 STAR REVIEWS. YOUR RATING WAS GOING DOWN FROM 3.0 TO 2.8 IN A MONTH OR SO. NOBODY BELIEVES YOUR APP HAS A RATING OF 3.8. I SEE 2 STAR REVIEWS ALL THE WAY DOWN 🤣🤣🤣🤣🤣")
print('\n', new_df_english.content[6])
sentence = new_df_english.content[6]
print(detect_langs(sentence), detect(sentence))


HEY BUMBLE.. YOUR RATING WAS AT 2.8 JUST A FEW MONTHS AGO? YET SOMEHOW NOW IT'S 3.8. ITS PRETTY OBVIOUS THAT YOU ARE GIVING YOURSELF FAKE 5 AND 4 STAR REVIEWS. YOUR RATING WAS GOING DOWN FROM 3.0 TO 2.8 IN A MONTH OR SO. NOBODY BELIEVES YOUR APP HAS A RATING OF 3.8. I SEE 2 STAR REVIEWS ALL THE WAY DOWN 🤣🤣🤣🤣🤣

 hey bumble   your rating was at   just a few months ago  yet somehow now it s    its pretty obvious that you are giving yourself fake  and  star reviews  your rating was going down from   to   in a month or so  nobody believes your app has a rating of    i see  star reviews all the way down 🤣🤣🤣🤣🤣
[en:0.9999964308923941] en


In [43]:
# custom stopword list

In [44]:
tv = TfidfVectorizer(stop_words = 'english', max_df=.95, min_df= 0.0001)
doc_term = tv.fit_transform(new_df_english.content)

doc_term.shape


(89506, 5406)

In [45]:
doc_term_df = pd.DataFrame(doc_term.toarray(), index=new_df_english.content, columns=tv.get_feature_names())
doc_term_df

Unnamed: 0_level_0,aap,ab,abandon,abandoned,abd,ability,able,abortion,abroad,abruptly,...,yr,yrs,yup,zero,zilch,zip,zodiac,zone,zoom,zoosk
content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
after being a premium user i m not able to login to my id pathetic experience no customer support is provided,0.0,0.0,0.0,0.0,0.0,0.0,0.26378,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fraudulent app if you install a basic version you will get many likes but it s blurred and you need to upgrade to premium plan to see your likes once you upgrade to a premium plan all the likes will vanish what a easy way to lure members to a premium plan,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
it s a lot better than hinge but it s still not that great reluctant to spend any money all of these dating apps are a bad value,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
good app thanks dear women you are beautiful,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
good app nice to use feels alot better using this than other apps around i wont mention any but it rhymes with binder,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
finally here,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
finally,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
finally an app where women have to start the conversation love the concept i feel more comfortable and less of a pest this way i did run out of people after minutes though,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
at last we have android version,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
print(type(doc_term_df.columns))
print(list(doc_term_df.columns[0:5]))

<class 'pandas.core.indexes.base.Index'>
['aap', 'ab', 'abandon', 'abandoned', 'abd']


In [48]:
words = tv.get_feature_names()
short_words = []
for word in words:
    if len(word)<4:
        short_words.append(word)
        
print(short_words)

['aap', 'ab', 'abd', 'acc', 'act', 'ad', 'add', 'ads', 'af', 'age', 'ago', 'ah', 'ai', 'ain', 'air', 'aka', 'amd', 'ans', 'ap', 'apo', 'app', 'aps', 'arm', 'art', 'asf', 'ask', 'ass', 'ate', 'atm', 'bad', 'bag', 'bam', 'ban', 'bar', 'bat', 'bay', 'bbb', 'bbf', 'bc', 'bcz', 'bed', 'bee', 'beg', 'bet', 'bf', 'bff', 'bhi', 'bi', 'big', 'bio', 'bit', 'biz', 'blm', 'boo', 'bot', 'bow', 'box', 'boy', 'bro', 'bs', 'bt', 'btw', 'bug', 'bum', 'bus', 'buy', 'bye', 'ca', 'cam', 'cap', 'car', 'cat', 'cc', 'ceo', 'cis', 'cmb', 'cos', 'cow', 'coz', 'cs', 'cue', 'cup', 'cut', 'cuz', 'da', 'dad', 'dam', 'day', 'dc', 'dec', 'def', 'dev', 'did', 'die', 'dig', 'dis', 'dm', 'dnt', 'dog', 'don', 'dry', 'duh', 'ea', 'eat', 'ect', 'ego', 'eh', 'ehh', 'ek', 'elo', 'em', 'end', 'esp', 'ex', 'exp', 'eye', 'fab', 'fan', 'faq', 'far', 'fat', 'fb', 'feb', 'fed', 'fee', 'ffs', 'fi', 'fir', 'fit', 'fix', 'fkn', 'fly', 'fo', 'ft', 'fu', 'fun', 'fwb', 'fyi', 'gal', 'gap', 'gay', 'gb', 'gf', 'gif', 'god', 'gor', 'got',

In [49]:
from collections import Counter

# reset index as I got the keyerror:1 - means the key doesnot exist
# df_english.reset_index(drop=True, inplace=True)

word_list = []

for i in range(89477):
    word_list.extend(new_df_english.content[i].split())
    
print(len(word_list))

2509544


In [50]:
counter = Counter(word_list)
print(len(counter))

27946


In [51]:
sorted_counter = sorted(counter.items(), reverse=True, key=lambda x: x[1])

sorted_counter

[('i', 86469),
 ('to', 81812),
 ('the', 78951),
 ('a', 56369),
 ('and', 55832),
 ('it', 52414),
 ('app', 47675),
 ('you', 40556),
 ('of', 33562),
 ('is', 31453),
 ('t', 31277),
 ('for', 30999),
 ('this', 26779),
 ('that', 23605),
 ('not', 23327),
 ('my', 22680),
 ('but', 22054),
 ('have', 21201),
 ('on', 21061),
 ('s', 19907),
 ('in', 19903),
 ('me', 17630),
 ('they', 17523),
 ('with', 16390),
 ('people', 14811),
 ('no', 13832),
 ('are', 13820),
 ('like', 13769),
 ('get', 13114),
 ('so', 13074),
 ('can', 12458),
 ('if', 12372),
 ('just', 12279),
 ('be', 12157),
 ('or', 11575),
 ('bumble', 11074),
 ('don', 10438),
 ('good', 10367),
 ('your', 10263),
 ('matches', 10231),
 ('time', 9538),
 ('was', 9224),
 ('even', 9211),
 ('all', 9101),
 ('as', 8875),
 ('there', 8756),
 ('one', 8338),
 ('when', 8126),
 ('use', 8120),
 ('women', 8088),
 ('dating', 8057),
 ('match', 7949),
 ('out', 7807),
 ('great', 7688),
 ('only', 7587),
 ('first', 7540),
 ('m', 7481),
 ('more', 7460),
 ('pay', 7447),
 ('

In [52]:
nmf = NMF(10, init = "nndsvda")
nmf.fit(doc_term)
nmf

NMF(init='nndsvda', n_components=10)

In [53]:
topic_term = nmf.components_.round(3)
topic_term.shape

(10, 5406)

In [54]:
topic_term_df = pd.DataFrame(topic_term.round(3),
                index = ["component_1", "component_2", "component_3", "component_4", "component_5", "component_6", "component_7", "component_8", "component_9", "component_10"],
                columns = tv.get_feature_names())
topic_term_df

Unnamed: 0,aap,ab,abandon,abandoned,abd,ability,able,abortion,abroad,abruptly,...,yr,yrs,yup,zero,zilch,zip,zodiac,zone,zoom,zoosk
component_1,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0
component_3,0.005,0.001,0.002,0.008,0.001,0.039,0.284,0.002,0.001,0.002,...,0.008,0.005,0.003,0.125,0.002,0.003,0.002,0.003,0.004,0.005
component_4,0.003,0.0,0.0,0.0,0.0,0.005,0.024,0.0,0.0,0.0,...,0.0,0.002,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.001
component_5,0.0,0.0,0.0,0.0,0.0,0.007,0.0,0.0,0.001,0.0,...,0.001,0.0,0.0,0.0,0.001,0.001,0.0,0.0,0.003,0.012
component_6,0.0,0.001,0.0,0.001,0.0,0.001,0.015,0.001,0.0,0.0,...,0.002,0.001,0.001,0.008,0.0,0.0,0.0,0.0,0.0,0.0
component_7,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002,0.0,0.0
component_8,0.019,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.04,0.002,0.0,0.0,0.0,0.0,0.0
component_9,0.002,0.001,0.0,0.003,0.002,0.006,0.036,0.0,0.002,0.0,...,0.0,0.0,0.0,0.0,0.0,0.003,0.0,0.003,0.002,0.0
component_10,0.005,0.0,0.0,0.0,0.0,0.005,0.0,0.0,0.0,0.0,...,0.002,0.0,0.002,0.003,0.0,0.001,0.0,0.003,0.001,0.0


In [55]:
# Function to display the top n terms in each topic
def display_topics(model, feature_names, no_top_words, topic_names = None): 
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix + 1)
        else:
            print("\nTopic: ", topic_names[ix])
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    print("\n")
    return model, feature_names, no_top_words

In [56]:
output = display_topics(nmf, tv.get_feature_names(), 30)
output;


Topic  1
good, pretty, far, experience, really, concept, app, idea, overall, luck, interface, site, looking, application, friends, job, thing, quality, girls, ladies, time, quite, lots, connections, wish, platform, think, meeting, dates, finding

Topic  2
great, app, concept, far, meet, way, women, experience, friends, idea, new, interface, meeting, girls, think, make, ladies, met, site, place, works, lots, connections, stuff, application, really, thanks, wish, guys, dates

Topic  3
matches, like, just, pay, match, don, women, bumble, message, account, make, got, profile, ve, profiles, swipe, want, money, fake, day, right, likes, free, subscription, premium, know, really, let, doesn, messages

Topic  4
easy, use, fun, facebook, simple, don, really, navigate, nice, super, interface, friendly, set, free, love, lots, sign, account, pretty, makes, features, user, works, concept, layout, understand, quick, filters, quality, intuitive

Topic  5
better, tinder, way, apps, quality, pof, like,

In [57]:
# change topic names, at the end
display_topics(nmf, tv.get_feature_names(), 10, ['bad_reviews', 'accessability', 'good_reviews', 'account', 'subscription', 'profiles', 'usage', 'comparing_to_tinder', 'fake_profiles', 'good_reviews']);
                                                         


Topic:  bad_reviews
good, pretty, far, experience, really, concept, app, idea, overall, luck

Topic:  accessability
great, app, concept, far, meet, way, women, experience, friends, idea

Topic:  good_reviews
matches, like, just, pay, match, don, women, bumble, message, account

Topic:  account
easy, use, fun, facebook, simple, don, really, navigate, nice, super

Topic:  subscription
better, tinder, way, apps, quality, pof, like, definitely, lot, worse

Topic:  profiles
app, love, worst, nice, awesome, using, sucks, open, bad, works

Topic:  usage
cool, pretty, really, met, concept, far, girls, like, fun, idea

Topic:  comparing_to_tinder
time, waste, money, don, fake, dont, complete, worst, total, worth

Topic:  fake_profiles
people, meet, nice, new, way, met, real, meeting, friends, fun

Topic:  good_reviews
dating, best, apps, far, site, bumble, ve, worst, used, online




In [58]:
doc_topic = nmf.transform(doc_term)
doc_topic.shape

(89506, 10)

In [60]:
doc_topic_df = pd.DataFrame(doc_topic.round(5), index = new_df_english.content, columns = ['bad_reviews', 'accessability', 'good_reviews', 'account', 'subscription', 'profiles', 'usage', 'comparing_to_tinder', 'fake_profiles', 'good_reviews'])
doc_topic_df

Unnamed: 0_level_0,bad_reviews,accessability,good_reviews,account,subscription,profiles,usage,comparing_to_tinder,fake_profiles,good_reviews
content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
after being a premium user i m not able to login to my id pathetic experience no customer support is provided,0.00117,0.00048,0.01170,0.00023,0.00015,0.00090,0.0000,0.00000,0.00000,0.00162
fraudulent app if you install a basic version you will get many likes but it s blurred and you need to upgrade to premium plan to see your likes once you upgrade to a premium plan all the likes will vanish what a easy way to lure members to a premium plan,0.00000,0.00013,0.01400,0.01472,0.00151,0.00760,0.0000,0.00000,0.00215,0.00000
it s a lot better than hinge but it s still not that great reluctant to spend any money all of these dating apps are a bad value,0.00000,0.03477,0.00444,0.00000,0.03067,0.00000,0.0000,0.00719,0.00000,0.03725
good app thanks dear women you are beautiful,0.03017,0.00097,0.00888,0.00000,0.00000,0.02264,0.0000,0.00000,0.00000,0.00085
good app nice to use feels alot better using this than other apps around i wont mention any but it rhymes with binder,0.02380,0.00000,0.00226,0.02781,0.03347,0.02135,0.0000,0.00000,0.01032,0.01028
...,...,...,...,...,...,...,...,...,...,...
finally here,0.00000,0.00000,0.00137,0.00000,0.00014,0.00009,0.0000,0.00000,0.00025,0.00042
finally,0.00000,0.00000,0.00137,0.00000,0.00014,0.00009,0.0000,0.00000,0.00025,0.00042
finally an app where women have to start the conversation love the concept i feel more comfortable and less of a pest this way i did run out of people after minutes though,0.00060,0.00147,0.01321,0.00000,0.00394,0.01872,0.0007,0.00000,0.03551,0.00000
at last we have android version,0.00000,0.00000,0.00338,0.00015,0.00058,0.00039,0.0000,0.00000,0.00000,0.00000
