## Refactoring

The class 'AppData' was created to collect the functions utilized for conducting data analysis. These functions were subsequently refactored to conform more closely to Python's coding conventions and best practices.

 - An object of the class 'AppData' is instantiated with its corresponding .csv file. The data contained in the file is available through the attributes AppData.header and AppData.dataset. 
 - The function 'remove_duplicates' was refactored. A python set 'already_added' is now responsible for keeping track of duplicated apps. 
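 - The function 'is_english' was renamed 'is_non_ascii', since several languages use the same set of characters. Note that it returns True when a name contains at most three non-ASCII characters, i.e. when the app should be kept. Additionally, the function now uses a Python generator expression. 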
 - The function 'remove_paid_apps' can now be described in only one line by using python's list comprehension. 
 - The functions 'freq_table' and 'display_table' were rendered more pythonic through the use of 'defaultdict' and lambda functions. 

In [141]:
from csv import reader 
from collections import defaultdict

class AppData():
    """In-memory copy of an app-store CSV file plus simple cleaning helpers.

    The first CSV row is stored in ``header`` and the remaining rows in
    ``dataset`` (a list of lists of strings). The cleaning methods replace
    ``dataset`` in place and return ``None``.

    Column positions for the app name, review count and price depend on
    ``app_type``: ``'android'`` uses columns 0, 3 and 7; any other value
    uses columns 1, 5 and 4.
    """

    # A name is kept by ``remove_non_ascii`` while it has at most this many
    # characters outside the ASCII range.
    MAX_NON_ASCII = 3

    def __init__(self, csv_file, app_type='android'):
        """Load ``csv_file`` and pick the column layout for ``app_type``.

        Raises:
            ValueError: if the file contains no rows at all.
        """
        # The csv module asks for newline='' so quoted fields containing line
        # breaks are parsed correctly; the explicit encoding keeps the result
        # independent of the platform's default locale.
        with open(csv_file, encoding='utf8', newline='') as file:
            data = list(reader(file))

        if not data:
            raise ValueError(f"{csv_file!r} is empty: expected a header row")

        self.header, *self.dataset = data
        self.type = app_type.lower()

        if self.type == 'android':
            self.name_index, self.reviews_index, self.price_index = 0, 3, 7
        else:
            self.name_index, self.reviews_index, self.price_index = 1, 5, 4

    def __repr__(self):
        """Return the header row rendered as a string."""
        return str(self.header)

    def number_apps(self):
        """Print the number of rows currently in the dataset."""
        print(len(self.dataset))

    def explore_data(self, start, end, rows_and_columns=False):
        """Print rows ``start:end`` and, optionally, the dataset dimensions."""
        for row in self.dataset[start:end]:
            print(row)
            print('\n')

        if rows_and_columns:
            # Fall back to the header width when every row has been filtered
            # out, instead of failing on dataset[0].
            first_row = self.dataset[0] if self.dataset else self.header
            print('Number of rows:', len(self.dataset))
            print('Number of columns:', len(first_row))

    def remove_duplicates(self):
        """Keep one row per app name: the first row with the most reviews."""
        # First pass: highest review count seen for each name.
        reviews_max = {}
        for app in self.dataset:
            name = app[self.name_index]
            n_reviews = float(app[self.reviews_index])
            if name not in reviews_max or n_reviews > reviews_max[name]:
                reviews_max[name] = n_reviews

        # Second pass: keep the first row that reaches that maximum, so the
        # surviving rows stay in their original order.
        clean = []
        already_added = set()
        for app in self.dataset:
            name = app[self.name_index]
            n_reviews = float(app[self.reviews_index])
            if n_reviews == reviews_max[name] and name not in already_added:
                clean.append(app)
                already_added.add(name)

        self.dataset = clean

    def is_non_ascii(self, string):
        """Return True when ``string`` has at most ``MAX_NON_ASCII`` non-ASCII characters.

        Despite the name, a True result means the string is (mostly) ASCII
        and the corresponding app is kept by ``remove_non_ascii``.
        """
        num_non_ascii = sum(ord(c) > 127 for c in string)
        return num_non_ascii <= self.MAX_NON_ASCII

    def remove_non_ascii(self):
        """Drop rows whose app name fails the ``is_non_ascii`` check."""
        self.dataset = [
            app for app in self.dataset
            if self.is_non_ascii(app[self.name_index])
        ]

    def remove_paid_apps(self):
        """Keep only rows whose price column is exactly ``'0'`` or ``'0.0'``."""
        self.dataset = [
            app for app in self.dataset
            if app[self.price_index] in ('0', '0.0')
        ]

    def freq_table(self, index):
        """Return ``{value: percentage_of_rows}`` for column ``index``.

        An empty dataset yields an empty dict.
        """
        total = len(self.dataset)
        if not total:
            # Nothing to count; avoids dividing by zero below.
            return {}

        table = defaultdict(int)
        for row in self.dataset:
            table[row[index]] += 1

        return {key: (count / total) * 100 for key, count in table.items()}

    def display_table(self, index):
        """Print the frequency table for column ``index``, largest share first."""
        table = self.freq_table(index)
        table_sorted = sorted(table.items(), key=lambda x: x[1], reverse=True)
        for key, value in table_sorted:
            print(f"{key}: {value}")


            

### iOS data exploration

In [142]:
# Load the App Store export using the non-Android column layout, show its
# header row, then print the first three data rows and the dataset dimensions.
ios = AppData('AppleStore.csv', app_type='ios')
print(ios)
ios.explore_data(0,3, True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16


### Android data exploration

In [143]:
# Load the Google Play export, show its header row, then print the first
# three data rows and the dataset dimensions.
android = AppData('googleplaystore.csv', app_type='android')
print(android)
android.explore_data(0,3,True)

# NOTE(review): removes the row at position 10472 by hand — presumably a
# malformed record in the raw file; confirm against the CSV. Re-running this
# cell on an already-trimmed dataset would delete a different row.
del android.dataset[10472]

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


### Remove duplicates of android data

In [144]:
# Keep a single row per app name (the one with the most reviews), then print
# how many rows remain.
android.remove_duplicates()
android.number_apps()

9659


### Remove non-ascii apps 

In [145]:
# Drop apps whose name has more than three non-ASCII characters from both
# datasets.
android.remove_non_ascii()
ios.remove_non_ascii()

# Print the remaining row counts (Android first, then iOS).
android.number_apps()
ios.number_apps()

9614
6183


### Remove paid apps from android and ios dataset

In [146]:
# Keep only rows whose price column is '0' or '0.0' in both datasets.
android.remove_paid_apps()
ios.remove_paid_apps()

# Print the remaining row counts (Android first, then iOS).
android.number_apps()
ios.number_apps()

8864
3222


### Frequency tables


In [148]:
# Column -5 of the iOS header is 'prime_genre'; print each genre's share of
# the remaining rows as a percentage, largest first.
ios.display_table(-5)

Games: 58.16263190564867
Entertainment: 7.883302296710118
Photo & Video: 4.9658597144630665
Education: 3.662321539416512
Social Networking: 3.2898820608317814
Shopping: 2.60707635009311
Utilities: 2.5139664804469275
Sports: 2.1415270018621975
Music: 2.0484171322160147
Health & Fitness: 2.0173805090006205
Productivity: 1.7380509000620732
Lifestyle: 1.5828677839851024
News: 1.3345747982619491
Travel: 1.2414649286157666
Finance: 1.1173184357541899
Weather: 0.8690254500310366
Food & Drink: 0.8069522036002483
Reference: 0.5586592178770949
Business: 0.5276225946617008
Book: 0.4345127250155183
Navigation: 0.186219739292365
Medical: 0.186219739292365
Catalogs: 0.12414649286157665


In [150]:
# Column -4 of the Android header is 'Genres'; print each genre's share of
# the remaining rows as a percentage, largest first.
android.display_table(-4)

Tools: 8.449909747292418
Entertainment: 6.069494584837545
Education: 5.347472924187725
Business: 4.591606498194946
Lifestyle: 3.892148014440433
Productivity: 3.892148014440433
Finance: 3.7003610108303246
Medical: 3.531137184115524
Sports: 3.463447653429603
Personalization: 3.3167870036101084
Communication: 3.2378158844765346
Action: 3.1024368231046933
Health & Fitness: 3.0798736462093865
Photography: 2.944494584837545
News & Magazines: 2.7978339350180503
Social: 2.6624548736462095
Travel & Local: 2.3240072202166067
Shopping: 2.2450361010830324
Books & Reference: 2.1435018050541514
Simulation: 2.0419675090252705
Dating: 1.861462093862816
Arcade: 1.8501805054151623
Video Players & Editors: 1.7712093862815883
Casual: 1.7599277978339352
Maps & Navigation: 1.3989169675090252
Food & Drink: 1.2409747292418771
Puzzle: 1.128158844765343
Racing: 0.9927797833935018
Libraries & Demo: 0.9363718411552346
Role Playing: 0.9363718411552346
Auto & Vehicles: 0.9250902527075812
Strategy: 0.913808664259927