In [578]:
import re

import numpy as np
import pandas as pd
# Load the two raw CSV exports; both paths are relative to the notebook's
# working directory.
df_playstore = pd.read_csv('googleplaystore.csv')
df_reviews = pd.read_csv('googleplaystore_user_reviews.csv')

In [579]:
def clean_text(df):
    """Lower-case column labels and normalise the text cells of a DataFrame.

    Column labels are lower-cased (spaces in labels are kept). In every
    object-dtype column, string cells are lower-cased and their spaces are
    replaced by underscores.

    Only genuine ``str`` values are rewritten. Non-string cells (numbers,
    None/NaN, ...) and non-string column labels pass through untouched; the
    ``.str`` accessor would have replaced them with NaN.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, for convenient reassignment/chaining.
    """
    def _normalise_cell(value):
        if isinstance(value, str):
            return value.lower().replace(' ', '_')
        return value

    def _normalise_label(label):
        return label.lower() if isinstance(label, str) else label

    # Index.map keeps the columns' ``name`` attribute intact.
    df.columns = df.columns.map(_normalise_label)

    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].map(_normalise_cell)

    return df
# Normalise labels and text values of the app table before any parsing.
df_playstore = df_playstore.pipe(clean_text)

In [580]:
def replace_dates_with_nan(value):
    """Return NaN for strings that parse as a '<month> <day> <year>' date.

    Underscores are treated as spaces and commas are dropped before the
    value is parsed with the format ``'%B %d %Y'``.

    Args:
        value: Any cell value.

    Returns:
        ``np.nan`` when ``value`` is a string in that date layout; otherwise
        ``value`` unchanged (non-strings are never inspected).
    """
    if not isinstance(value, str):
        return value

    candidate = value.replace('_', ' ').replace(',', '')
    try:
        # Parsed only to find out whether it is a date; the result is unused.
        pd.to_datetime(candidate, format='%B %d %Y', errors='raise')
    except ValueError:
        # Not a date in the expected layout: keep the original text.
        return value
    return np.nan

In [581]:
def convert_float(number, characters):
    """Convert a raw cell to ``float`` after stripping decoration characters.

    Args:
        number: The value to convert. Strings have every entry of
            ``characters`` removed first. Non-string values (already-numeric
            cells, None, NaN) are passed to ``float()`` directly, so
            converting a column a second time does not crash.
        characters: Iterable of substrings to delete from a string value
            before conversion (e.g. ``['$']`` or ``[',', '+']``).

    Returns:
        The parsed float, or NaN when the value cannot be converted.
    """
    if isinstance(number, str):
        for char in characters:
            number = number.replace(char, '')
    try:
        return float(number)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects; ValueError
        # covers text that is not a number once the characters are removed.
        return float('nan')

def convert_column_to_float(df, column_name, characters):
    """Replace one column with its ``convert_float`` conversion.

    Args:
        df: DataFrame holding the column; it is modified in place.
        column_name: Label of the column to convert.
        characters: Substrings handed to ``convert_float`` for removal.

    Returns:
        The same DataFrame object.
    """
    def _parse(cell):
        return convert_float(cell, characters)

    df[column_name] = df[column_name].apply(_parse)
    return df

# Turn the decorated text columns into floats, one column per step.
# NOTE(review): for 'size' only 'm' is stripped, so any value carrying a
# different suffix fails float() and becomes NaN. Confirm whether other unit
# suffixes (e.g. 'k') occur in the data and need scaling.
df_playstore = (
    df_playstore
    .pipe(convert_column_to_float, 'price', ['$'])
    .pipe(convert_column_to_float, 'size', ['m'])
    .pipe(convert_column_to_float, 'installs', [',', '+'])
    .pipe(convert_column_to_float, 'reviews', [])
)

In [582]:
def isFloat(str):
    """Tell whether ``float()`` accepts the given value.

    Args:
        str: The value to probe. The parameter name shadows the builtin
            inside this function; it is kept for keyword callers.

    Returns:
        True when ``float(str)`` succeeds, False otherwise. Values of an
        unsupported type (None, lists, ...) and integers too large for a
        float yield False instead of raising.
    """
    try:
        float(str)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [583]:
def remove_digit(string):
    """Blank out a text value that is purely numeric.

    Args:
        string: Cell value to inspect.

    Returns:
        ``''`` when ``string`` is a str made only of digits or one that
        ``isFloat`` accepts; otherwise the value unchanged. Non-string
        values (e.g. NaN for a missing cell) are returned as they are
        rather than crashing on ``.isdigit()``.
    """
    if not isinstance(string, str):
        return string
    # NOTE(review): float() also accepts 'nan', 'inf', 'infinity' and forms
    # such as '1e5', so those texts are blanked too. Confirm that is wanted.
    if string.isdigit() or isFloat(string):
        return ''
    return string
# Blank out purely numeric entries in the three descriptive text columns.
df_playstore['app'] = df_playstore['app'].map(remove_digit)
df_playstore['category'] = df_playstore['category'].map(remove_digit)
df_playstore['genres'] = df_playstore['genres'].map(remove_digit)

In [584]:
def normalize_and_encode_ratings(df, column_name):
    """Add an integer ``rating_label`` column derived from a rating column.

    The values of ``column_name`` are looked up in a fixed table of
    lower-case, underscore-separated labels. Anything not in the table,
    including missing values, is encoded as -1.

    Args:
        df: DataFrame to extend; it is modified in place.
        column_name: Label of the column holding the rating text.

    Returns:
        The same DataFrame object, now carrying ``rating_label``.
    """
    ordinal_by_rating = {
        'unrated': 0,
        'everyone': 1,
        'everyone_10+': 2,
        'teen': 3,
        'mature_17+': 4,
        'adults_only_18+': 5,
    }
    encoded = df[column_name].map(ordinal_by_rating)
    # Unmapped entries come back as NaN; give them the -1 sentinel so the
    # column can be cast to an integer dtype.
    df['rating_label'] = encoded.fillna(-1).astype('int')
    return df
# Encode the content-rating text into the integer 'rating_label' column.
df_playstore = df_playstore.pipe(normalize_and_encode_ratings, 'content rating')


In [585]:
def clean_date(date_str):
    """Parse a '<month>_<day>,_<year>' style string into a timestamp.

    Underscores are treated as spaces and commas are dropped before the
    text is parsed with the format ``'%B %d %Y'``.

    Args:
        date_str: Cell value to parse.

    Returns:
        A ``pd.Timestamp`` for a parseable string, or ``pd.NaT`` when the
        text does not match the format. A value that is already a
        ``pd.Timestamp`` is returned unchanged, so applying the function to
        an already-converted column keeps its dates. Any other non-string
        value (None, NaN, numbers) yields ``pd.NaT`` without printing.
    """
    if isinstance(date_str, pd.Timestamp):
        return date_str
    if not isinstance(date_str, str):
        return pd.NaT

    cleaned_date = date_str.replace('_', ' ').replace(',', '')
    try:
        return pd.to_datetime(cleaned_date, format='%B %d %Y', errors='coerce')
    except (TypeError, ValueError, OverflowError) as e:
        # errors='coerce' should already map bad text to NaT; this only
        # reports anything the parser still rejects.
        print(f"Error processing date: {date_str} - {e}")
        return pd.NaT

# Run clean_date over every 'last updated' entry and store what it returns
# (a parsed timestamp, or NaT where clean_date could not parse the value).
df_playstore['last updated'] = df_playstore['last updated'].apply(clean_date)

In [586]:
def replace_dates_with_null(df, column, months_list):
    """Null out cells of ``column`` that mention one of the given months.

    Matching is case-insensitive and requires the month to appear as a
    whole word: it must not be directly preceded or followed by another
    letter. 'february_11,_2018' and 'May 5' match, while 'mayhem' or
    'dismay' are left alone.

    Args:
        df: DataFrame holding the column; it is modified in place.
        column: Label of the column to scan.
        months_list: Month names to look for. Empty entries are ignored, and
            an empty list leaves the column untouched.

    Returns:
        The same DataFrame object.
    """
    months = [month.lower() for month in months_list if month]
    if not months:
        # Nothing to search for; an empty alternation would match anywhere.
        return df

    letter = r'[^\W\d_]'  # any Unicode letter (word char minus digits and '_')
    month_token = re.compile(
        rf'(?<!{letter})(?:'
        + '|'.join(re.escape(month) for month in months)
        + rf')(?!{letter})'
    )

    def contains_month(value):
        if isinstance(value, str) and month_token.search(value.lower()):
            return np.nan
        return value

    df[column] = df[column].apply(contains_month)

    return df
# Null out 'genres' cells that actually hold a date (they mention a month).
df_playstore = df_playstore.pipe(
    replace_dates_with_null,
    'genres',
    months_list=[
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
    ],
)

In [587]:
def map_type_in_place(df, type_col):
    """Encode a 'free'/'paid' text column as 0/1, modifying ``df`` in place.

    Values other than 'free' and 'paid' (including missing ones) become NaN.
    A column that already has a numeric dtype is left untouched, so running
    the function a second time does not turn the 0/1 codes into NaN.

    Args:
        df: DataFrame holding the column.
        type_col: Label of the column to encode.

    Returns:
        None. The result is written back into ``df[type_col]``.
    """
    # bool / signed int / unsigned int / float: treated as already encoded.
    if df[type_col].dtype.kind in 'biuf':
        return

    type_mapping = {'free': 0, 'paid': 1}
    df[type_col] = df[type_col].map(type_mapping)

# Drop rows whose raw type is the literal text '0' BEFORE encoding.
# map_type_in_place turns every value other than 'free'/'paid' (so also '0')
# into NaN, which means this comparison can never match once the column has
# been mapped.
# NOTE(review): assumes the intent is to remove rows with a raw '0' type,
# not rows encoded as 0 (free). Confirm.
df_playstore.drop(df_playstore[df_playstore['type'] == '0'].index, inplace=True)
map_type_in_place(df_playstore, 'type')

In [588]:
def remove_duplicates(df):
    """Return ``df`` without fully duplicated rows.

    The first occurrence of each row is kept and the original index labels
    are preserved. The input frame itself is not modified.

    Args:
        df: DataFrame to de-duplicate.

    Returns:
        A new DataFrame containing only the distinct rows.
    """
    deduplicated = df.drop_duplicates(keep='first')
    return deduplicated
# Keep only the first occurrence of each fully identical row.
df_playstore = df_playstore.pipe(remove_duplicates)

In [589]:
def create_app_id_column(df, app_name_col, new_id_col):
    """Number the distinct app names and expose the number as a leading column.

    Ids start at 1 and follow the order in which each name first appears.

    Args:
        df: Source DataFrame. The id column is also written into this
            object as a side effect.
        app_name_col: Label of the column holding the app names.
        new_id_col: Label for the generated id column.

    Returns:
        A DataFrame with ``new_id_col`` first, followed by the remaining
        columns in their existing order.
    """
    distinct_names = df[app_name_col].unique()
    id_by_name = dict(zip(distinct_names, range(1, len(distinct_names) + 1)))

    df[new_id_col] = df[app_name_col].map(id_by_name)

    trailing = [label for label in df.columns if label not in [new_id_col]]
    return df[[new_id_col, *trailing]]
# Give every distinct app name a numeric id and move it to the front.
df_playstore = df_playstore.pipe(create_app_id_column, 'app', 'app_id')

In [592]:
def encode_categorical(df, columns, encoding_type='label'):
    """Encode categorical columns on a copy of ``df``.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Iterable of column labels to encode, processed in order.
        encoding_type: ``'label'`` replaces each column with its integer
            category codes; ``'onehot'`` replaces each column with dummy
            indicator columns, dropping the first level.

    Returns:
        The encoded copy.

    Raises:
        ValueError: If ``encoding_type`` is not supported, or if a requested
            column is absent when its turn comes.
    """
    if encoding_type not in ('label', 'onehot'):
        raise ValueError("encoding_type must be 'label' or 'onehot'")

    use_label_codes = encoding_type == 'label'
    result = df.copy()

    for name in columns:
        if name not in result.columns:
            raise ValueError(f"Column {name} not found in DataFrame")

        if use_label_codes:
            result[name] = result[name].astype('category').cat.codes
        else:
            result = pd.get_dummies(result, columns=[name], drop_first=True)

    return result
# One-hot encode 'category' on a copy of the app table.
# NOTE(review): the variable name says "label" but the call requests
# 'onehot'. Confirm which encoding is intended.
df_encoded_label = df_playstore.pipe(
    encode_categorical,
    columns=['category'],
    encoding_type='onehot',
)

In [596]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def scale_numerical_features(df, columns, scaling_type='standardize'):
    """Scale the selected columns on a copy of ``df``.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Labels of the columns to scale together.
        scaling_type: ``'standardize'`` fits a ``StandardScaler``;
            ``'normalize'`` fits a ``MinMaxScaler``.

    Returns:
        A copy of ``df`` whose ``columns`` hold the fitted scaler's output.

    Raises:
        ValueError: If ``scaling_type`` is neither supported value.
    """
    if scaling_type not in ('standardize', 'normalize'):
        raise ValueError("scaling_type must be 'standardize' or 'normalize'")

    scaler = StandardScaler() if scaling_type == 'standardize' else MinMaxScaler()

    scaled = df.copy()
    scaled[columns] = scaler.fit_transform(scaled[columns])
    return scaled
