In [7]:
# for data manipulation
import pandas as pd
import numpy as np

# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.linear_model import BayesianRidge 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

# import models for regression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [8]:
df = pd.read_csv('googleplaystore.csv')

# Drop the columns that are not used in the rest of this cell:
# 'Last Updated', 'Current Ver', 'Android Ver' and 'Content Rating'.
df = df.drop(columns=['Last Updated', 'Current Ver', 'Android Ver', 'Content Rating'])

# Keep only rows whose 'Size' is a number followed by 'M' or 'k' (decimals such
# as '8.7M' included) or the literal 'Varies with device'.
# `Series.str.match` anchors at the START of the string, so the pattern has to
# describe the whole value; a bare `\d+M` would reject '8.7M'.
# na=False treats a missing size as "no match" instead of putting NaN in the
# boolean mask, and .copy() detaches the result so the column assignments that
# follow operate on an independent frame (no SettingWithCopyWarning).
size_pattern = r'(?:\d+(?:\.\d+)?[Mk]|Varies with device)$'
df = df[df['Size'].str.match(size_pattern, na=False)].copy()

# Impute missing ratings AFTER filtering, so rows rejected by the size filter
# do not contribute to the fill value.
# NOTE(review): only one column is passed in, so the imputer has no other
# feature to regress on; the XGBRegressor estimator and max_iter presumably
# have no effect here and the result is the initial (mean) fill — confirm
# against the scikit-learn docs, and consider SimpleImputer if so.
impute = IterativeImputer(estimator=XGBRegressor(), max_iter=100, random_state=42)
# fit_transform returns an (n_rows, 1) array; flatten it to a 1-D column.
df['Rating'] = impute.fit_transform(df[['Rating']]).ravel()

def convert_size(convertings):
    """Convert a raw size label into a number expressed in the 'k' unit.

    Parameters
    ----------
    convertings : str or number
        Raw value of the 'Size' column, e.g. ``'19M'``, ``'8.7M'``, ``'201k'``,
        ``'Varies with device'``, a missing value, or an already-numeric value.

    Returns
    -------
    number
        * ``'<n>M'`` -> ``n * 1024``
        * ``'<n>k'`` -> ``n``
        * missing value or ``'Varies with device'`` -> ``np.nan``
        * a non-string (already numeric) value -> returned unchanged
        * any other string -> ``pd.to_numeric`` of that string

    Raises
    ------
    ValueError
        If the text left after removing the unit cannot be parsed as a number.
    """
    # The missing-value check must come first: the substring tests below raise
    # TypeError when given a float NaN.
    if pd.isna(convertings):
        return np.nan
    # Already numeric, e.g. when the cell is re-run on a converted column.
    if not isinstance(convertings, str):
        return convertings
    if 'M' in convertings:
        return pd.to_numeric(convertings.replace('M', '')) * 1024
    if 'k' in convertings:
        return pd.to_numeric(convertings.replace('k', ''))
    if 'Varies with device' in convertings:
        return np.nan
    return pd.to_numeric(convertings)
    
# Run every 'Size' entry through convert_size(), then replace the values that
# are still missing afterwards with the mean of the converted column.
# TODO: revisit how missing 'Size' values are imputed (currently the column mean).
df['Size'] = df['Size'].map(convert_size)
df['Size'] = df['Size'].fillna(value=df['Size'].mean())

def convert(convertings):
    """Parse an install-count label such as ``'10,000+'`` into a number.

    Parameters
    ----------
    convertings : str or number
        Raw value of the 'Installs' column. Thousands separators (``,``) and a
        ``+`` are both removed before parsing. Non-string values (already
        numeric, or missing) are returned unchanged.

    Returns
    -------
    number
        The numeric install count, e.g. ``'10,000+'`` -> ``10000``.

    Raises
    ------
    ValueError
        If the cleaned text cannot be parsed as a number.
    """
    # Non-strings have nothing to strip; passing them through also keeps the
    # function safe to re-apply to an already converted column.
    if not isinstance(convertings, str):
        return convertings
    # Both characters can occur in the same value ('10,000+'), so remove both
    # unconditionally rather than choosing one of them.
    cleaned = convertings.replace(',', '').replace('+', '')
    return pd.to_numeric(cleaned)
    
# Pass every raw 'Installs' entry through convert() and store the result back.
df['Installs'] = df['Installs'].map(convert)

# dealing with the 'Price' column
def convertings(convert):
    """Parse a price label such as ``'$4.99'`` or ``'0'`` into a number.

    Parameters
    ----------
    convert : str or number
        Raw value of the 'Price' column. Inside this function the parameter
        name shadows the module-level ``convert`` function; that function is
        not used here.

    Returns
    -------
    number
        The price with any ``$`` removed, e.g. ``'$4.99'`` -> ``4.99`` and
        ``'0'`` -> ``0``. Missing values and text that cannot be parsed as a
        number yield ``np.nan``. Non-string values are returned unchanged.
    """
    # Guard first: the '$' handling below only applies to strings.
    if pd.isna(convert):
        return np.nan
    if not isinstance(convert, str):
        return convert
    # Parse every string the same way, whether or not it carries a '$', so
    # values like '0.00' or '4.99' no longer fall through and return None.
    # errors='coerce' maps unparseable text to NaN instead of raising.
    return pd.to_numeric(convert.replace('$', ''), errors='coerce')

# Price: pass every raw entry through convertings().
df['Price'] = df['Price'].map(convertings)

# Reviews: remove apostrophes from an entry when it has any, then parse the
# entry as a number.
df['Reviews'] = df['Reviews'].map(
    lambda raw: pd.to_numeric(raw if "'" not in raw else raw.replace("'", ''))
)

# Write the cleaned frame to disk; the index is not written to the file.
df.to_csv('googleplaystore_cleaned.csv', index=False)