In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('data/googleplaystore.csv')

In [None]:
# print("Dataset shape: ", df.shape)
# print("First 5 rows\n", df.head())
print("Column Names: ", df.columns.tolist())
# print("Info: ", df.info)

Column Names:  ['Sr.', 'App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver', 'Rating_normalized']


In [14]:
print("Missing values in dataset\n", df.isnull().sum())

# fixing missing values
# numerical values
df['Rating'].fillna(df['Rating'].median(), inplace=True)
print("After handling fixing numerical values\n", df.isnull().sum())

# categorical values
df['Type'].fillna('Unknown', inplace=True)
df['Content Rating'].fillna('Unknown', inplace=True)
df['Current Ver'].fillna('Unknown', inplace=True)
df['Android Ver'].fillna('Unknown', inplace=True)
print("After handling fixing categorical values\n", df.isnull().sum())

Missing values in dataset
 App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64
After handling fixing numerical values
 App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              1
Price             0
Content Rating    1
Genres            0
Last Updated      0
Current Ver       8
Android Ver       3
dtype: int64
After handling fixing categorical values
 App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rating'].fillna(df['Rating'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Type'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

In [13]:
print("Data types:")
print(df.dtypes)

Data types:
App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object


In [16]:
print("Descriptive statistics:")
print(df.describe())

Descriptive statistics:
             Rating
count  10841.000000
mean       4.207841
std        0.500893
min        1.000000
25%        4.100000
50%        4.300000
75%        4.500000
max       19.000000


In [17]:
# Calculate the min and max of the column
data_min = df['Rating'].min()
data_max = df['Rating'].max()

# Apply the formula using Pandas
df['Rating_normalized'] = (df['Rating'] - data_min) / (data_max - data_min)

# Verify the result
print("Original 'Rating' range:", df['Rating'].min(), "-", df['Rating'].max())
print("Normalized 'Rating_normalized' range:", df['Rating_normalized'].min(), "-", df['Rating_normalized'].max())

Original 'Rating' range: 1.0 - 19.0
Normalized 'Rating_normalized' range: 0.0 - 1.0


In [None]:
df.to_csv('data/cleaned_googleplaystore.csv')

print("Data cleaning complete! Cleaned data saved to 'cleaned_dataset.csv'.")
print("Final shape:", df.shape)

Data cleaning complete! Cleaned data saved to 'cleaned_dataset.csv'.
Final shape: (10841, 14)
