# Play Store Data Analysis
## Data Cleaning | Exploratory Data Analysis | Feature Engineering

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Load the raw Play Store dataset directly from GitHub (needs network access)
df = pd.read_csv("https://raw.githubusercontent.com/krishnaik06/playstore-Dataset/main/googleplaystore.csv")
# Preview the first rows to sanity-check the load
df.head()

## 1. Initial Data Exploration

In [None]:
# Quick structural overview of the raw frame: dimensions, column names,
# per-column dtypes, per-column null counts and the number of fully
# duplicated rows.
print(f"Shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nData Types:\n{df.dtypes}")
print(f"\nMissing Values:\n{df.isnull().sum()}")
print(f"\nDuplicate Rows: {df.duplicated().sum()}")

In [None]:
# Concise summary: index range, non-null counts, dtypes and memory usage
df.info()

## 2. Data Cleaning

### 2.1 Remove Duplicates

In [None]:
# Drop rows that are identical across every column, reporting the row count
# before and after. The drop mutates `df` in place.
print(f"Before: {df.shape[0]} rows")
df.drop_duplicates(inplace=True)
print(f"After: {df.shape[0]} rows")

### 2.2 Remove Corrupted Rows

In [None]:
# A valid 'Category' consists solely of upper-case letters and underscores.
# Missing categories count as invalid (na=False). The mask is built once and
# reused for both reporting and filtering.
valid_category = df['Category'].str.match(r'^[A-Z_]+$', na=False)
corrupted_rows = df[~valid_category]
print(f"Corrupted rows found: {len(corrupted_rows)}")
if len(corrupted_rows) > 0:
    print(corrupted_rows)
    # .copy() makes `df` an independent frame, so the column assignments in
    # later cells write to it directly instead of to a slice of the old frame.
    df = df[valid_category].copy()
    print(f"Rows after cleanup: {df.shape[0]}")

### 2.3 Clean Rating Column

In [None]:
# Convert to numeric; values that cannot be parsed become NaN (errors='coerce')
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
print(f"Missing ratings: {df['Rating'].isnull().sum()}")

# Fill missing ratings with the column median (computed over non-missing values)
median_rating = df['Rating'].median()
df['Rating'] = df['Rating'].fillna(median_rating)
print(f"Filled missing with median: {median_rating}")

### 2.4 Clean Reviews Column

In [None]:
# Coerce review counts to numbers (unparseable -> NaN), treat missing counts
# as zero, then store the column as integers.
reviews_numeric = pd.to_numeric(df['Reviews'], errors='coerce')
df['Reviews'] = reviews_numeric.fillna(0).astype(int)
print(f"Reviews dtype: {df['Reviews'].dtype}")

### 2.5 Clean Size Column

In [None]:
def clean_size(size):
    """Convert a raw size value to megabytes.

    Args:
        size: Raw size such as ``'19M'``, ``'201k'`` or
            ``'Varies with device'``; may also be missing (None/NaN).

    Returns:
        The size in MB as a float (``k`` values are divided by 1024), or
        ``np.nan`` when the value is missing, is ``'Varies with device'``,
        carries no ``M``/``k`` unit, or has a numeric part that cannot be
        parsed.
    """
    if pd.isna(size):
        return np.nan
    # Stringify once so a non-string input never reaches str-only methods.
    text = str(size).strip()
    if text == 'Varies with device':
        return np.nan
    if 'M' in text:
        number, divisor = text.replace('M', ''), 1
    elif 'k' in text:
        number, divisor = text.replace('k', ''), 1024
    else:
        return np.nan
    try:
        # Thousands separators are dropped before parsing.
        return float(number.replace(',', '')) / divisor
    except ValueError:
        # One malformed entry should not abort the whole column conversion.
        return np.nan

# Derive a numeric size column by applying clean_size to every 'Size' value
df['Size_MB'] = df['Size'].apply(clean_size)
print(f"Missing sizes: {df['Size_MB'].isnull().sum()}")

# Fill missing sizes with the median of the parsed values
median_size = df['Size_MB'].median()
df['Size_MB'] = df['Size_MB'].fillna(median_size)
print(f"Filled with median: {median_size:.2f} MB")

### 2.6 Clean Installs Column

In [None]:
def clean_installs(installs):
    """Convert a raw install-count string to an integer.

    Args:
        installs: Raw value such as ``'10,000+'``; may be missing (None/NaN).

    Returns:
        The count as an ``int`` after removing ``,`` and ``+``, or ``np.nan``
        when the value is missing or is not a whole number once stripped.
    """
    if pd.isna(installs):
        return np.nan
    digits = str(installs).replace(',', '').replace('+', '').strip()
    try:
        return int(digits)
    except ValueError:
        # Coerce unparseable entries to NaN, matching the other column
        # cleaners, instead of aborting the whole .apply().
        return np.nan

# Derive a numeric install-count column from the raw 'Installs' strings,
# then report its dtype and a few sample values as a sanity check
df['Installs_Numeric'] = df['Installs'].apply(clean_installs)
print(f"Installs dtype: {df['Installs_Numeric'].dtype}")
print(f"Sample values: {df['Installs_Numeric'].head().tolist()}")

### 2.7 Clean Price Column

In [None]:
def clean_price(price):
    """Convert a raw price value to a float amount.

    Args:
        price: Raw value such as ``'$4.99'`` or ``'0'``; may be missing.

    Returns:
        The price as a float after removing ``$`` and ``,``. Missing or
        unparseable values are returned as ``0.0``.
    """
    if pd.isna(price):
        return 0.0
    price_str = str(price).replace('$', '').replace(',', '')
    try:
        return float(price_str)
    except ValueError:
        # Only a failed numeric parse is expected here; anything else should
        # surface rather than be silently turned into a zero price.
        return 0.0

# Derive a numeric price column from the raw 'Price' strings, then report
# how many distinct prices exist and the min/max range
df['Price_USD'] = df['Price'].apply(clean_price)
print(f"Unique prices: {df['Price_USD'].nunique()}")
print(f"Price range: ${df['Price_USD'].min():.2f} - ${df['Price_USD'].max():.2f}")

### 2.8 Clean Last Updated Column

In [None]:
# Parse dates written as "<Month name> <day>, <year>" ('%B %d, %Y'); values
# that do not match the format become NaT because of errors='coerce'
df['Last_Updated_Date'] = pd.to_datetime(df['Last Updated'], format='%B %d, %Y', errors='coerce')
print(f"Date range: {df['Last_Updated_Date'].min()} to {df['Last_Updated_Date'].max()}")

### 2.9 Clean Android Version Column

In [None]:
def extract_android_version(ver):
    """Extract the leading ``major.minor`` version number as a float.

    Only the first whitespace-separated token is considered, and only its
    first two dot-separated components are kept (``'4.0.3 and up'`` -> 4.0).

    Args:
        ver: Raw value such as ``'4.1 and up'`` or ``'Varies with device'``;
            may be missing (None/NaN).

    Returns:
        The version as a float, or ``np.nan`` when the value is missing, is
        ``'Varies with device'``, is blank, or is not numeric.
    """
    if pd.isna(ver) or ver == 'Varies with device':
        return np.nan
    try:
        first_token = str(ver).split()[0]  # IndexError on a blank string
        major_minor = first_token.split('.')[:2]
        return float('.'.join(major_minor))  # ValueError on non-numeric text
    except (IndexError, ValueError):
        return np.nan

# Derive a numeric minimum-version column from the raw 'Android Ver' strings
df['Min_Android_Ver'] = df['Android Ver'].apply(extract_android_version)
print(f"Missing Android versions: {df['Min_Android_Ver'].isnull().sum()}")

# Fill with mode (the first entry is taken if several values tie)
mode_ver = df['Min_Android_Ver'].mode()[0]
df['Min_Android_Ver'] = df['Min_Android_Ver'].fillna(mode_ver)
print(f"Filled with mode: {mode_ver}")

### 2.10 Clean Type & Content Rating Columns

In [None]:
# Handle Type column: report missing values, default them to 'Free',
# then show the resulting distribution
print(f"Missing Type values: {df['Type'].isnull().sum()}")
df['Type'] = df['Type'].fillna('Free')
print(f"Type distribution:\n{df['Type'].value_counts()}")

# Handle Content Rating the same way, defaulting missing values to 'Everyone'
print(f"\nMissing Content Rating: {df['Content Rating'].isnull().sum()}")
df['Content Rating'] = df['Content Rating'].fillna('Everyone')
print(f"Content Rating distribution:\n{df['Content Rating'].value_counts()}")

### 2.11 Create Final Cleaned DataFrame

In [None]:
# Select relevant cleaned columns (independent copy, so later edits to
# df_cleaned never touch df)
selected_columns = [
    'App', 'Category', 'Rating', 'Reviews', 'Size_MB',
    'Installs_Numeric', 'Type', 'Price_USD', 'Content Rating',
    'Genres', 'Last_Updated_Date', 'Min_Android_Ver',
]
df_cleaned = df[selected_columns].copy()

# Rename for clarity. An explicit old -> new mapping keeps each label tied to
# its source column; overwriting `.columns` positionally would silently
# mislabel data if the selection above were ever reordered.
renamed_columns = {
    'Installs_Numeric': 'Installs',
    'Content Rating': 'Content_Rating',
    'Last_Updated_Date': 'Last_Updated',
}
df_cleaned = df_cleaned.rename(columns=renamed_columns)

print(f"Cleaned DataFrame Shape: {df_cleaned.shape}")
print(f"\nData Types:\n{df_cleaned.dtypes}")
print(f"\nMissing Values:\n{df_cleaned.isnull().sum()}")

In [None]:
# Preview the first ten rows of the cleaned frame
df_cleaned.head(10)

### 2.12 Summary Statistics

In [None]:
# Descriptive statistics for the cleaned frame (default describe() settings)
df_cleaned.describe()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Set plot style
sns.set_style('whitegrid')
sns.set_palette('husl')

### 3.1 Rating Distribution

In [None]:
# Two side-by-side views of the Rating column: a 20-bin histogram and a box plot
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram
axes[0].hist(df_cleaned['Rating'], bins=20, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Rating')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of App Ratings')

# Box plot
axes[1].boxplot(df_cleaned['Rating'])
axes[1].set_ylabel('Rating')
axes[1].set_title('Rating Box Plot')

plt.tight_layout()
plt.show()

### 3.2 Category Distribution

In [None]:
# Horizontal bar chart of the number of apps per category; value_counts()
# orders the categories from most to least frequent
plt.figure(figsize=(12, 8))
category_counts = df_cleaned['Category'].value_counts()
sns.barplot(x=category_counts.values, y=category_counts.index, palette='viridis')
plt.xlabel('Number of Apps')
plt.ylabel('Category')
plt.title('Apps by Category')
plt.tight_layout()
plt.show()

### 3.3 Free vs Paid Apps

In [None]:
# Left: share of apps per Type as a pie chart. Right: rating spread per Type.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Pie chart
type_counts = df_cleaned['Type'].value_counts()
# Pull out only the first (most frequent) slice. The offsets are sized from
# the data because matplotlib rejects an `explode` sequence whose length
# differs from the number of slices.
explode = [0.05 if i == 0 else 0 for i in range(len(type_counts))]
axes[0].pie(type_counts, labels=type_counts.index, autopct='%1.1f%%',
            colors=['#66b3ff', '#ff9999'], explode=explode)
axes[0].set_title('Free vs Paid Apps')

# Rating comparison
sns.boxplot(x='Type', y='Rating', data=df_cleaned, ax=axes[1], palette='Set2')
axes[1].set_title('Rating by App Type')

plt.tight_layout()
plt.show()

### 3.4 Top 10 Most Installed Apps

In [None]:
# The ten rows with the highest install counts, reduced to the columns of interest.
# NOTE(review): selection is per row, not per app name. If the same app occurs
# in several rows it can appear more than once here; confirm against the data.
top_10 = df_cleaned.nlargest(10, 'Installs')[['App', 'Installs', 'Rating', 'Category']]
print("Top 10 Most Installed Apps:")
top_10

In [None]:
# Horizontal bar chart of the ten rows with the highest install counts.
# NOTE(review): rows sharing an App name would be drawn as a single aggregated
# bar; verify that app names are unique among these rows.
plt.figure(figsize=(12, 6))
top_10_plot = df_cleaned.nlargest(10, 'Installs')
sns.barplot(x='Installs', y='App', data=top_10_plot, palette='rocket')
plt.xlabel('Number of Installs')
plt.ylabel('App Name')
plt.title('Top 10 Most Installed Apps')
plt.tight_layout()
plt.show()

### 3.5 Content Rating Analysis

In [None]:
# Vertical bar chart of the number of apps per content rating, with the
# x tick labels rotated 45 degrees
plt.figure(figsize=(10, 6))
content_counts = df_cleaned['Content_Rating'].value_counts()
sns.barplot(x=content_counts.index, y=content_counts.values, palette='coolwarm')
plt.xlabel('Content Rating')
plt.ylabel('Number of Apps')
plt.title('Apps by Content Rating')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### 3.6 Correlation Heatmap

In [None]:
# Select numeric columns for correlation
numeric_cols = df_cleaned[['Rating', 'Reviews', 'Size_MB', 'Installs', 'Price_USD', 'Min_Android_Ver']]

plt.figure(figsize=(10, 8))
correlation_matrix = numeric_cols.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
            fmt='.2f', linewidths=0.5, square=True)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

### 3.7 Size vs Rating

In [None]:
# Scatter plot of app size against rating; the low alpha keeps dense regions
# readable where many points overlap
plt.figure(figsize=(10, 6))
plt.scatter(df_cleaned['Size_MB'], df_cleaned['Rating'], alpha=0.3, c='purple')
plt.xlabel('Size (MB)')
plt.ylabel('Rating')
plt.title('App Size vs Rating')
plt.tight_layout()
plt.show()

### 3.8 Average Rating by Category

In [None]:
# Mean rating per category, sorted from highest to lowest
avg_rating = df_cleaned.groupby('Category')['Rating'].mean().sort_values(ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x=avg_rating.values, y=avg_rating.index, palette='Spectral')
plt.xlabel('Average Rating')
plt.ylabel('Category')
plt.title('Average Rating by Category')
# The x-axis is restricted to 3.5-4.5 to magnify differences between
# categories; any mean outside that window is drawn clipped at the axis limit.
plt.xlim(3.5, 4.5)
plt.tight_layout()
plt.show()

## 4. Save Cleaned Data

In [None]:
# Uncomment to save the cleaned dataset
# df_cleaned.to_csv('playstore_cleaned.csv', index=False)
# print("Cleaned data saved to 'playstore_cleaned.csv'")