In [3]:
import chardet
import pandas as pd

## Fixing the file embedding

In [3]:
file_path_1 = "user_reviews.csv"

In [7]:
def detect_file_encoding(file_path_1):
    with open(file_path_1, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
        encoding = result['encoding']
        confidence = result['confidence']
        print(f"Detected Encoding: {encoding} (Confidence: {confidence * 100:.2f}%)")
        return encoding

In [9]:
detected_encoding = detect_file_encoding(file_path_1)

Detected Encoding: Windows-1254 (Confidence: 53.36%)


In [11]:
try_encodings = ['utf-8', 'latin1', 'cp1252', 'big5', 'gbk']

for encoding in try_encodings:
    try:
        with open(file_path_1, 'r', encoding=encoding) as f:
            data = f.read()
            print(f"Successfully decoded with {encoding}")
            break
    except UnicodeDecodeError as e:
        print(f"Failed with {encoding}: {e}")

Successfully decoded with utf-8


In [13]:
with open(file_path_1, 'rb') as f:
    raw_data = f.read()

decoded_data_1 = raw_data.decode('utf-8', errors='replace')

In [15]:
with open('fixed_file_1.txt', 'w', encoding='utf-8') as f:
    f.write(decoded_data_1)

In [5]:
user_review = pd.read_csv('fixed_file_1.txt')

In [19]:
file_path_2 = "apps.csv"

In [21]:
detected_encoding = detect_file_encoding(file_path_2)

Detected Encoding: Windows-1254 (Confidence: 51.48%)


In [22]:
try_encodings = ['utf-8', 'latin1', 'cp1252', 'big5', 'gbk']

for encoding in try_encodings:
    try:
        with open(file_path_2, 'r', encoding=encoding) as f:
            data = f.read()
            print(f"Successfully decoded with {encoding}")
            break
    except UnicodeDecodeError as e:
        print(f"Failed with {encoding}: {e}")

Successfully decoded with utf-8


In [25]:
with open(file_path_2, 'rb') as f:
    raw_data = f.read()

decoded_data_2 = raw_data.decode('utf-8', errors='replace')

In [27]:
with open('fixed_file_2.txt', 'w', encoding='utf-8') as f:
    f.write(decoded_data_2)

In [7]:
apps = pd.read_csv('fixed_file_2.txt')

## Data Preparation

In [10]:
apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9659 entries, 0 to 9658
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   App              9659 non-null   object 
 1   Category_Proper  9659 non-null   object 
 2   Rating           8196 non-null   float64
 3   Reviews          9659 non-null   int64  
 4   Size             8432 non-null   float64
 5   Installs_num     9659 non-null   object 
 6   Installs         9659 non-null   object 
 7   Type             9659 non-null   object 
 8   Price ($)        9659 non-null   float64
 9   Content Rating   9659 non-null   object 
 10  Sub-genre        393 non-null    object 
 11  Genre            9659 non-null   object 
 12  Last Updated     9659 non-null   object 
 13  Current Ver      9651 non-null   object 
 14  Android Ver      9657 non-null   object 
 15  Unnamed: 15      0 non-null      float64
 16  Unnamed: 16      1 non-null      object 
dtypes: float64(4),

In [12]:
apps.drop(['Unnamed: 15', 'Unnamed: 16'], axis=1, inplace=True)

In [14]:
apps.rename(columns={'Category_Proper': 'Category', 'Installs_num':'Installs (Num)'}, inplace=True)

In [16]:
user_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37427 entries, 0 to 37426
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     37427 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               37427 non-null  object 
 3   Sentiment_Polarity      37427 non-null  float64
 4   Sentiment_Subjectivity  37427 non-null  float64
dtypes: float64(2), object(3)
memory usage: 1.4+ MB


In [18]:
user_review.rename(columns={'Translated_Review':'Translated Review', 'Sentiment_Polarity':'Sentiment Polarity', 
                    'Sentiment_Subjectivity':'Sentiment Subjectivity'}, inplace=True)

### Fixing App Names

In [21]:
from googletrans import Translator
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

In [23]:
DetectorFactory.seed = 0

def get_non_english_app_names(df, column_name='App'):
    non_english_apps = []
    
    for app_name in df[column_name]:
        if pd.isnull(app_name):
            continue  
        try:
            if detect(app_name) != 'en':
                non_english_apps.append(app_name)
        except LangDetectException:
            print(f"Language detection failed for: {app_name}")
    
    return non_english_apps

In [25]:
translator = Translator()

In [27]:
def is_english(text):
    try:
        detected_lang = langdetect.detect(text)
        return detected_lang == 'en'
    except:
        return False

In [29]:
def translate_column(df, column_name, new_column_name):
    translations = []
    for text in df[column_name]:
        if is_english(text):
            translations.append(text)
        else:      
            try:
                translated_text = translator.translate(text, src='auto', dest='en').text
                translations.append(translated_text)
            except Exception as e:
                translations.append("Translation Error")  
    df[new_column_name] = pd.Series(translations)
    return df

##### 1. user_review table

In [32]:
series = user_review['App'].unique()

In [34]:
user_review_app_name = pd.DataFrame(series, columns=['App'])

In [36]:
non_english_apps = get_non_english_app_names(user_review_app_name, column_name='App')

In [37]:
non_english_apps = pd.DataFrame(non_english_apps, columns=['App'])

In [38]:
non_english_apps

Unnamed: 0,App
0,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室
1,11st
2,591房屋交易-租屋、中古屋、新建案、實價登錄、別墅透天、公寓套房、捷運、買房賣房行情、房價...
3,591房屋交易-香港
4,8 Ball Pool
...,...
314,Homework Planner
315,Honkai Impact 3rd
316,Hostelworld: Hostels & Cheap Hotels Travel App
317,Hot Wheels: Race Off


In [42]:
non_english_apps_translated = translate_column(non_english_apps, column_name='App', new_column_name='Translated App Name')

In [43]:
non_english_apps_translated

Unnamed: 0,App,Translated App Name
0,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,104 Find a job -find a job to find a job to fi...
1,11st,11st
2,591房屋交易-租屋、中古屋、新建案、實價登錄、別墅透天、公寓套房、捷運、買房賣房行情、房價...,"591 Housing Trading-Rental House, Middle Ages,..."
3,591房屋交易-香港,591 House Trading-Hong Kong
4,8 Ball Pool,8 Ball Pool
...,...,...
314,Homework Planner,Homework Planner
315,Honkai Impact 3rd,Honkai Impact 3rd
316,Hostelworld: Hostels & Cheap Hotels Travel App,Hostelworld: Hostels & Cheap Hotels Travel App
317,Hot Wheels: Race Off,Hot Wheels: Race Off


##### 2. apps table

In [46]:
series_2 = apps['App'].unique()

In [48]:
app_name = pd.DataFrame(series_2, columns=['App'])

In [50]:
app_name

Unnamed: 0,App
0,"""i DT"" Fútbol. Todos Somos Técnicos."
1,.R
2,/u/app
3,[adult swim]
4,[BN] Blitz
...,...
9654,ZUL - Rotativo Digital BH
9655,zulily - Shop Daily Deals in Fashion and Home
9656,Zumper - Apartment Rental Finder
9657,Zynga Poker – Texas Holdem


In [52]:
non_english_apps = get_non_english_app_names(app_name, column_name='App')

Language detection failed for: BQ-መጽሐፍ ቅዱሳዊ ጥያቄዎች
Language detection failed for: DF@realtime
Language detection failed for: Myjob@BM
Language detection failed for: sustainability@BU


In [54]:
non_english_apps = pd.DataFrame(non_english_apps, columns=['App'])

In [56]:
non_english_apps

Unnamed: 0,App
0,"""i DT"" Fútbol. Todos Somos Técnicos."
1,.R
2,/u/app
3,[adult swim]
4,[BN] Blitz
...,...
4349,Zowi App
4350,ZUL - Rotativo Digital BH
4351,Zumper - Apartment Rental Finder
4352,Zynga Poker – Texas Holdem


In [58]:
non_english_apps_translated_2 = translate_column(non_english_apps, column_name='App', new_column_name='Translated App Name')

In [64]:
non_english_apps_translated_2 

Unnamed: 0,App,Translated App Name
0,"""i DT"" Fútbol. Todos Somos Técnicos.","""I DT"" Soccer.We are all technicians."
1,.R,.R
2,/u/app,/u/app
3,[adult swim],[adult swim]
4,[BN] Blitz,[BN] Blitz
...,...,...
4349,Zowi App,Zowi APP
4350,ZUL - Rotativo Digital BH,Zul - BH Digital Rotary
4351,Zumper - Apartment Rental Finder,Zumper - Apartment Rental Finder
4352,Zynga Poker – Texas Holdem,Zynga Poker – Texas Holdem


### Joining Translated App Name dfs with Main dfs

In [66]:
non_english_apps_translated

Unnamed: 0,App,Translated App Name
0,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,104 Find a job -find a job to find a job to fi...
1,11st,11st
2,591房屋交易-租屋、中古屋、新建案、實價登錄、別墅透天、公寓套房、捷運、買房賣房行情、房價...,"591 Housing Trading-Rental House, Middle Ages,..."
3,591房屋交易-香港,591 House Trading-Hong Kong
4,8 Ball Pool,8 Ball Pool
...,...,...
314,Homework Planner,Homework Planner
315,Honkai Impact 3rd,Honkai Impact 3rd
316,Hostelworld: Hostels & Cheap Hotels Travel App,Hostelworld: Hostels & Cheap Hotels Travel App
317,Hot Wheels: Race Off,Hot Wheels: Race Off


In [120]:
final_user_review = pd.merge(user_review, non_english_apps_translated, how='left', on='App')

In [70]:
non_english_apps_translated_2

Unnamed: 0,App,Translated App Name
0,"""i DT"" Fútbol. Todos Somos Técnicos.","""I DT"" Soccer.We are all technicians."
1,.R,.R
2,/u/app,/u/app
3,[adult swim],[adult swim]
4,[BN] Blitz,[BN] Blitz
...,...,...
4349,Zowi App,Zowi APP
4350,ZUL - Rotativo Digital BH,Zul - BH Digital Rotary
4351,Zumper - Apartment Rental Finder,Zumper - Apartment Rental Finder
4352,Zynga Poker – Texas Holdem,Zynga Poker – Texas Holdem


In [72]:
final_apps = pd.merge(apps, non_english_apps_translated_2, how='left', on='App')

In [74]:
final_apps

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs (Num),Installs,Type,Price ($),Content Rating,Sub-genre,Genre,Last Updated,Current Ver,Android Ver,Translated App Name
0,"""i DT"" Fútbol. Todos Somos Técnicos.",Sports,,27,3.6,500,500+,Free,0.0,Everyone,,Sports,"October 7, 2017",0.22,4.1 and up,"""I DT"" Soccer.We are all technicians."
1,.R,Tools,4.5,259,0.2,10000,"10,000+",Free,0.0,Everyone,,Tools,"September 16, 2014",1.1.06,1.5 and up,.R
2,/u/app,Communication,4.7,573,53.0,10000,"10,000+",Free,0.0,Rated for 17+,,Communication,"July 3, 2018",4.2.4,4.1 and up,/u/app
3,[adult swim],Family,3.6,21433,21.0,1000000,"1,000,000+",Free,0.0,Rated for 17+,,Entertainment,"May 29, 2018",3.0.1805181047,5.0 and up,[adult swim]
4,[BN] Blitz,Sports,3.2,4,5.6,100,100+,Free,0.0,Everyone,,Sports,"May 17, 2018",4.6,4.3 and up,[BN] Blitz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9654,ZUL - Rotativo Digital BH,Auto and Vehicles,4.2,29,,10000,"10,000+",Free,0.0,Everyone,,Auto & Vehicles,"August 1, 2018",1.6.5,4.1 and up,Zul - BH Digital Rotary
9655,zulily - Shop Daily Deals in Fashion and Home,Shopping,4.5,28560,,1000000,"1,000,000+",Free,0.0,Everyone,,Shopping,"July 23, 2018",Varies with device,Varies with device,
9656,Zumper - Apartment Rental Finder,House and Home,4.4,11200,25.0,1000000,"1,000,000+",Free,0.0,Everyone,,House & Home,"July 16, 2018",4.5.15,5.0 and up,Zumper - Apartment Rental Finder
9657,Zynga Poker – Texas Holdem,Game,4.4,1986068,52.0,50000000,"50,000,000+",Free,0.0,Rated for 13+,,Casino,"July 25, 2018",21.54,4.1 and up,Zynga Poker – Texas Holdem


### Filling NA

In [77]:
final_apps['Translated App Name'] = final_apps['Translated App Name'].fillna(final_apps['App'])

In [79]:
final_apps

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs (Num),Installs,Type,Price ($),Content Rating,Sub-genre,Genre,Last Updated,Current Ver,Android Ver,Translated App Name
0,"""i DT"" Fútbol. Todos Somos Técnicos.",Sports,,27,3.6,500,500+,Free,0.0,Everyone,,Sports,"October 7, 2017",0.22,4.1 and up,"""I DT"" Soccer.We are all technicians."
1,.R,Tools,4.5,259,0.2,10000,"10,000+",Free,0.0,Everyone,,Tools,"September 16, 2014",1.1.06,1.5 and up,.R
2,/u/app,Communication,4.7,573,53.0,10000,"10,000+",Free,0.0,Rated for 17+,,Communication,"July 3, 2018",4.2.4,4.1 and up,/u/app
3,[adult swim],Family,3.6,21433,21.0,1000000,"1,000,000+",Free,0.0,Rated for 17+,,Entertainment,"May 29, 2018",3.0.1805181047,5.0 and up,[adult swim]
4,[BN] Blitz,Sports,3.2,4,5.6,100,100+,Free,0.0,Everyone,,Sports,"May 17, 2018",4.6,4.3 and up,[BN] Blitz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9654,ZUL - Rotativo Digital BH,Auto and Vehicles,4.2,29,,10000,"10,000+",Free,0.0,Everyone,,Auto & Vehicles,"August 1, 2018",1.6.5,4.1 and up,Zul - BH Digital Rotary
9655,zulily - Shop Daily Deals in Fashion and Home,Shopping,4.5,28560,,1000000,"1,000,000+",Free,0.0,Everyone,,Shopping,"July 23, 2018",Varies with device,Varies with device,zulily - Shop Daily Deals in Fashion and Home
9656,Zumper - Apartment Rental Finder,House and Home,4.4,11200,25.0,1000000,"1,000,000+",Free,0.0,Everyone,,House & Home,"July 16, 2018",4.5.15,5.0 and up,Zumper - Apartment Rental Finder
9657,Zynga Poker – Texas Holdem,Game,4.4,1986068,52.0,50000000,"50,000,000+",Free,0.0,Rated for 13+,,Casino,"July 25, 2018",21.54,4.1 and up,Zynga Poker – Texas Holdem


In [122]:
final_user_review['Translated App Name'] = final_user_review['Translated App Name'].fillna(final_user_review['App'])

In [124]:
final_user_review

Unnamed: 0,App,Translated Review,Sentiment,Sentiment Polarity,Sentiment Subjectivity,Translated App Name
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.000000,0.533333,10 Best Foods for You
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.250000,0.288462,10 Best Foods for You
2,10 Best Foods for You,Works great especially going grocery store,Positive,0.400000,0.875000,10 Best Foods for You
3,10 Best Foods for You,Best idea us,Positive,1.000000,0.300000,10 Best Foods for You
4,10 Best Foods for You,Best way,Positive,1.000000,0.300000,10 Best Foods for You
...,...,...,...,...,...,...
37422,Housing-Real Estate & Property,Most ads older many agents ..not much owner po...,Positive,0.173333,0.486667,Housing-Real Estate & Property
37423,Housing-Real Estate & Property,"If photos posted portal load, fit purpose. I'm...",Positive,0.225000,0.447222,Housing-Real Estate & Property
37424,Housing-Real Estate & Property,"Dumb app, I wanted post property rent give opt...",Negative,-0.287500,0.250000,Housing-Real Estate & Property
37425,Housing-Real Estate & Property,I property business got link SMS happy perform...,Positive,0.800000,1.000000,Housing-Real Estate & Property


### Creating Unique IDs for Apps

In [86]:
final_apps['Unique ID'] = pd.factorize(final_apps['App'])[0]

In [90]:
final_apps = final_apps [['Unique ID', 'App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs (Num)', 'Installs', 'Type', 'Price ($)', 
                        'Content Rating', 'Sub-genre', 'Genre', 'Last Updated', 'Current Ver', 'Android Ver', 'Translated App Name']]

In [92]:
final_apps

Unnamed: 0,Unique ID,App,Category,Rating,Reviews,Size,Installs (Num),Installs,Type,Price ($),Content Rating,Sub-genre,Genre,Last Updated,Current Ver,Android Ver,Translated App Name
0,0,"""i DT"" Fútbol. Todos Somos Técnicos.",Sports,,27,3.6,500,500+,Free,0.0,Everyone,,Sports,"October 7, 2017",0.22,4.1 and up,"""I DT"" Soccer.We are all technicians."
1,1,.R,Tools,4.5,259,0.2,10000,"10,000+",Free,0.0,Everyone,,Tools,"September 16, 2014",1.1.06,1.5 and up,.R
2,2,/u/app,Communication,4.7,573,53.0,10000,"10,000+",Free,0.0,Rated for 17+,,Communication,"July 3, 2018",4.2.4,4.1 and up,/u/app
3,3,[adult swim],Family,3.6,21433,21.0,1000000,"1,000,000+",Free,0.0,Rated for 17+,,Entertainment,"May 29, 2018",3.0.1805181047,5.0 and up,[adult swim]
4,4,[BN] Blitz,Sports,3.2,4,5.6,100,100+,Free,0.0,Everyone,,Sports,"May 17, 2018",4.6,4.3 and up,[BN] Blitz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9654,9654,ZUL - Rotativo Digital BH,Auto and Vehicles,4.2,29,,10000,"10,000+",Free,0.0,Everyone,,Auto & Vehicles,"August 1, 2018",1.6.5,4.1 and up,Zul - BH Digital Rotary
9655,9655,zulily - Shop Daily Deals in Fashion and Home,Shopping,4.5,28560,,1000000,"1,000,000+",Free,0.0,Everyone,,Shopping,"July 23, 2018",Varies with device,Varies with device,zulily - Shop Daily Deals in Fashion and Home
9656,9656,Zumper - Apartment Rental Finder,House and Home,4.4,11200,25.0,1000000,"1,000,000+",Free,0.0,Everyone,,House & Home,"July 16, 2018",4.5.15,5.0 and up,Zumper - Apartment Rental Finder
9657,9657,Zynga Poker – Texas Holdem,Game,4.4,1986068,52.0,50000000,"50,000,000+",Free,0.0,Rated for 13+,,Casino,"July 25, 2018",21.54,4.1 and up,Zynga Poker – Texas Holdem


#### Storing data for further processing

In [149]:
final_user_review

Unnamed: 0,App,Translated Review,Sentiment,Sentiment Polarity,Sentiment Subjectivity,Translated App Name
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.000000,0.533333,10 Best Foods for You
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.250000,0.288462,10 Best Foods for You
2,10 Best Foods for You,Works great especially going grocery store,Positive,0.400000,0.875000,10 Best Foods for You
3,10 Best Foods for You,Best idea us,Positive,1.000000,0.300000,10 Best Foods for You
4,10 Best Foods for You,Best way,Positive,1.000000,0.300000,10 Best Foods for You
...,...,...,...,...,...,...
53934,Housing-Real Estate & Property,Most ads older many agents ..not much owner po...,Positive,0.173333,0.486667,Housing-Real Estate & Property
53935,Housing-Real Estate & Property,"If photos posted portal load, fit purpose. I'm...",Positive,0.225000,0.447222,Housing-Real Estate & Property
53936,Housing-Real Estate & Property,"Dumb app, I wanted post property rent give opt...",Negative,-0.287500,0.250000,Housing-Real Estate & Property
53937,Housing-Real Estate & Property,I property business got link SMS happy perform...,Positive,0.800000,1.000000,Housing-Real Estate & Property


In [168]:
final_user_review.to_csv('final_user_review.csv', index=False, encoding='utf-8')

In [170]:
final_apps.to_csv('final_apps.csv', index=False, encoding='utf-8')

### Final Merge

In [108]:
user_review = pd.read_csv('final_user_review.csv')

In [5]:
apps = pd.read_csv('final_apps.csv')

In [112]:
result = pd.merge(user_review, apps[['Unique ID', 'Translated App Name']], how='left', on='Translated App Name')

In [114]:
result.drop_duplicates(inplace=True)

In [115]:
unmatched_list = result[result['Unique ID'].isnull()]['App'].unique()

In [116]:
len(unmatched_list)

47

In [144]:
unmatched_list

array(['104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室',
       '591房屋交易-租屋、中古屋、新建案、實價登錄、別墅透天、公寓套房、捷運、買房賣房行情、房價房貸查詢', '591房屋交易-香港',
       'ANA', 'AirBrush: Easy Photo Editor', 'Apple Daily 蘋果動新聞',
       'Aprender inglés con Wlingua', 'BaBe - Baca Berita',
       'BaBe Lite - Baca Berita Hemat Kuota', 'BaBe+ - Berita Indonesia',
       'Bagan - Myanmar Keyboard', 'Banco Itaú', 'Banco do Brasil',
       'Bangla Newspaper – Prothom Alo', 'Banque Populaire',
       'Birdays – Birthday reminder',
       'Brasileirão Pro 2018 - Série A e B', 'Buienradar - weer',
       'Bukalapak - Jual Beli Online', 'BukuBayi - Perkembangan Bayi',
       'Buscapé - Ofertas e descontos', 'BÁO MỚI - Đọc Báo, Tin Tức 24h',
       'CAIXA', 'Caf - Mon Compte', 'Claro',
       'ColorFil - Adult Coloring Book', 'Curso de Ingles Gratis',
       'Cut the Rope 2', 'DELISH KITCHEN - 無料レシピ動画で料理を楽しく・簡単に！',
       'Daum Mail - 다음 메일',
       'Delivery Club–Доставка еды:пицца,суши,бургер,салат',
       'Despegar.com Hoteles y Vuelos'

In [120]:
incomplete_user_review_df = result[result['Unique ID'].isnull()]

In [122]:
result = result.dropna(subset=['Unique ID'])

In [124]:
result['Unique ID'] = result['Unique ID'].astype(int)

In [126]:
incomplete_user_review_df.drop('Unique ID', axis=1, inplace=True)

In [132]:
incomplete_user_review_df.reset_index()

Unnamed: 0,index,App,Translated Review,Sentiment,Sentiment Polarity,Sentiment Subjectivity,Translated App Name
0,194,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,Great,Positive,0.800000,0.750000,104 Find a job -find a job to find a job to fi...
1,195,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,nice,Positive,0.600000,1.000000,104 Find a job -find a job to find a job to fi...
2,196,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,Almost mobile phone,Neutral,0.000000,0.000000,104 Find a job -find a job to find a job to fi...
3,197,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,"Very effective, efficient convenient find avai...",Positive,0.626667,0.666667,104 Find a job -find a job to find a job to fi...
4,198,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,Easy intuitive,Positive,0.433333,0.833333,104 Find a job -find a job to find a job to fi...
...,...,...,...,...,...,...,...
1365,1654702,Home Street – Home Design Game,There many things,Positive,0.500000,0.500000,Home Street – Home Design Game
1366,1654703,Home Street – Home Design Game,This game fun also little boring okay game u w...,Negative,-0.197917,0.500000,Home Street – Home Design Game
1367,1654704,Home Street – Home Design Game,It nice I would recommend would work without n...,Positive,0.311111,0.555556,Home Street – Home Design Game
1368,1654705,Home Street – Home Design Game,I anything say I love,Positive,0.500000,0.600000,Home Street – Home Design Game


In [134]:
incomplete_user_review_df

Unnamed: 0,App,Translated Review,Sentiment,Sentiment Polarity,Sentiment Subjectivity,Translated App Name
194,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,Great,Positive,0.800000,0.750000,104 Find a job -find a job to find a job to fi...
195,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,nice,Positive,0.600000,1.000000,104 Find a job -find a job to find a job to fi...
196,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,Almost mobile phone,Neutral,0.000000,0.000000,104 Find a job -find a job to find a job to fi...
197,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,"Very effective, efficient convenient find avai...",Positive,0.626667,0.666667,104 Find a job -find a job to find a job to fi...
198,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,Easy intuitive,Positive,0.433333,0.833333,104 Find a job -find a job to find a job to fi...
...,...,...,...,...,...,...
1654702,Home Street – Home Design Game,There many things,Positive,0.500000,0.500000,Home Street – Home Design Game
1654703,Home Street – Home Design Game,This game fun also little boring okay game u w...,Negative,-0.197917,0.500000,Home Street – Home Design Game
1654704,Home Street – Home Design Game,It nice I would recommend would work without n...,Positive,0.311111,0.555556,Home Street – Home Design Game
1654705,Home Street – Home Design Game,I anything say I love,Positive,0.500000,0.600000,Home Street – Home Design Game


In [140]:
incomplete_user_review_df.to_csv('incomplete_user_review.csv', index=False, encoding='utf-8')

In [136]:
apps_subset = apps[['Unique ID', 'App', 'Translated App Name']]

In [138]:
apps_subset 

Unnamed: 0,Unique ID,App,Translated App Name
0,0,"""i DT"" Fútbol. Todos Somos Técnicos.","""I DT"" Soccer.We are all technicians."
1,1,.R,.R
2,2,/u/app,/u/app
3,3,[adult swim],[adult swim]
4,4,[BN] Blitz,[BN] Blitz
...,...,...,...
9654,9654,ZUL - Rotativo Digital BH,Zul - BH Digital Rotary
9655,9655,zulily - Shop Daily Deals in Fashion and Home,zulily - Shop Daily Deals in Fashion and Home
9656,9656,Zumper - Apartment Rental Finder,Zumper - Apartment Rental Finder
9657,9657,Zynga Poker – Texas Holdem,Zynga Poker – Texas Holdem


In [142]:
apps_subset.to_csv('apps_subset.csv', index=False, encoding='utf-8')

## Dataset for Fuzzy Match in Power Query

In [151]:
sheet_name = "result_2"

In [155]:
result_2 = pd.read_excel("final.xlsx", sheet_name=sheet_name)

In [163]:
result_2.rename(columns={'apps_subset.Unique ID':'Unique ID'}, inplace=True)

In [165]:
result_2

Unnamed: 0,App,Translated Review,Sentiment,Sentiment Polarity,Sentiment Subjectivity,Translated App Name,Unique ID
0,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,Great,Positive,0.800000,0.750000,104 Find a job -find a job to find a job to fi...,24
1,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,nice,Positive,0.600000,1.000000,104 Find a job -find a job to find a job to fi...,24
2,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,Almost mobile phone,Neutral,0.000000,0.000000,104 Find a job -find a job to find a job to fi...,24
3,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,"Very effective, efficient convenient find avai...",Positive,0.626667,0.666667,104 Find a job -find a job to find a job to fi...,24
4,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,Easy intuitive,Positive,0.433333,0.833333,104 Find a job -find a job to find a job to fi...,24
...,...,...,...,...,...,...,...
855,Dr. Oetker Rezeptideen,Nice It nice great Ui.,Positive,0.666667,0.916667,Dr.Oetker recipe ideas,3376
856,Dr. Oetker Rezeptideen,Can I translate language english Give reply soon,Neutral,0.000000,0.000000,Dr.Oetker recipe ideas,3376
857,Dr. Oetker Rezeptideen,nice design good work guys,Positive,0.650000,0.800000,Dr.Oetker recipe ideas,3376
858,Dr. Oetker Rezeptideen,Easy recipes easy bake,Positive,0.433333,0.833333,Dr.Oetker recipe ideas,3376


In [149]:
result

Unnamed: 0,App,Translated Review,Sentiment,Sentiment Polarity,Sentiment Subjectivity,Translated App Name,Unique ID
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.000000,0.533333,10 Best Foods for You,16
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.250000,0.288462,10 Best Foods for You,16
2,10 Best Foods for You,Works great especially going grocery store,Positive,0.400000,0.875000,10 Best Foods for You,16
3,10 Best Foods for You,Best idea us,Positive,1.000000,0.300000,10 Best Foods for You,16
4,10 Best Foods for You,Best way,Positive,1.000000,0.300000,10 Best Foods for You,16
...,...,...,...,...,...,...,...
1655598,Housing-Real Estate & Property,Most ads older many agents ..not much owner po...,Positive,0.173333,0.486667,Housing-Real Estate & Property,5420
1655599,Housing-Real Estate & Property,"If photos posted portal load, fit purpose. I'm...",Positive,0.225000,0.447222,Housing-Real Estate & Property,5420
1655600,Housing-Real Estate & Property,"Dumb app, I wanted post property rent give opt...",Negative,-0.287500,0.250000,Housing-Real Estate & Property,5420
1655601,Housing-Real Estate & Property,I property business got link SMS happy perform...,Positive,0.800000,1.000000,Housing-Real Estate & Property,5420


In [167]:
df = pd.concat([result, result_2], ignore_index=True)

In [171]:
df.drop_duplicates(inplace=True)

In [175]:
df = df[['Unique ID', 'App', 'Translated Review', 'Sentiment', 'Sentiment Polarity', 'Sentiment Subjectivity']]

In [177]:
df

Unnamed: 0,Unique ID,App,Translated Review,Sentiment,Sentiment Polarity,Sentiment Subjectivity
0,16,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.000000,0.533333
1,16,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.250000,0.288462
2,16,10 Best Foods for You,Works great especially going grocery store,Positive,0.400000,0.875000
3,16,10 Best Foods for You,Best idea us,Positive,1.000000,0.300000
4,16,10 Best Foods for You,Best way,Positive,1.000000,0.300000
...,...,...,...,...,...,...
41657,3376,Dr. Oetker Rezeptideen,Nice It nice great Ui.,Positive,0.666667,0.916667
41658,3376,Dr. Oetker Rezeptideen,Can I translate language english Give reply soon,Neutral,0.000000,0.000000
41659,3376,Dr. Oetker Rezeptideen,nice design good work guys,Positive,0.650000,0.800000
41660,3376,Dr. Oetker Rezeptideen,Easy recipes easy bake,Positive,0.433333,0.833333


In [179]:
df.to_csv('user_review_final_df.csv', index=False, encoding='utf-8')

In [9]:
apps.drop('Translated App Name', axis=1, inplace=True)

In [11]:
apps.to_csv('apps_final_df.csv', index=False, encoding='utf-8')