<a href="https://colab.research.google.com/github/ben45123/AWS-Lambda-Research/blob/main/AppleDatasetAnalysis2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import patsy
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
# load your pre-cleaned file
df = pd.read_csv('cleanedAppleStore.csv')

# quick peek
print(df.shape)
display(df.head())
df.info()

(7197, 11)


Unnamed: 0,appName,price,userRatings,userRatingsCurrent,averageRating,averageRatingCurrent,version,contentRating,category,supportingDevices,supportedLanguages
0,PAC-MAN Premium,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,10
1,Evernote - stay organized,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,23
2,"WeatherBug - Local Weather, Radar, Maps, Alerts",0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,3
3,"eBay: Best App to Buy, Sell, Save! Online Shop...",0.0,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,9
4,Bible,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,45


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7197 entries, 0 to 7196
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   appName               7197 non-null   object 
 1   price                 7197 non-null   float64
 2   userRatings           7197 non-null   int64  
 3   userRatingsCurrent    7197 non-null   int64  
 4   averageRating         7197 non-null   float64
 5   averageRatingCurrent  7197 non-null   float64
 6   version               7197 non-null   object 
 7   contentRating         7197 non-null   object 
 8   category              7197 non-null   object 
 9   supportingDevices     7197 non-null   int64  
 10  supportedLanguages    7197 non-null   int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 618.6+ KB


In [3]:
# log-transform your “userRatings” as the dependent variable
df['log_userRatings'] = np.log1p(df['userRatings'])

# formula: log_userRatings predicted by price, avg rating, device support, plus categories
formula = 'log_userRatings ~ price + averageRating + supportingDevices + C(category) + C(contentRating)'

# build X and y
y, X = patsy.dmatrices(formula, data=df, return_type='dataframe')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

print(f"Train: {len(X_train)} rows; Test: {len(X_test)} rows")

Train: 5757 rows; Test: 1440 rows


In [6]:
model = sm.OLS(y_train, X_train).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:        log_userRatings   R-squared:                       0.460
Model:                            OLS   Adj. R-squared:                  0.458
Method:                 Least Squares   F-statistic:                     174.5
Date:                Sun, 27 Apr 2025   Prob (F-statistic):               0.00
Time:                        19:53:58   Log-Likelihood:                -13292.
No. Observations:                5757   AIC:                         2.664e+04
Df Residuals:                    5728   BIC:                         2.684e+04
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

In [7]:
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)

print(f"Test RMSE = {rmse:.3f}")
print(f"Test R²   = {test_r2:.3f}")

Test RMSE = 2.387
Test R²   = 0.472


In [8]:
# pick the handful you care about
key = ['price','averageRating',
       'C(category)[T.Social Networking]','C(contentRating)[T.17+]']
ci = model.conf_int().loc[key]
ci.columns = ['2.5%','97.5%']
print("95% CIs:\n", ci)

95% CIs:
                                       2.5%     97.5%
price                            -0.034617 -0.013778
averageRating                     1.405497  1.491544
C(category)[T.Social Networking]  1.977067  3.288661
C(contentRating)[T.17+]          -0.826338 -0.266563


In [9]:
# standardize numeric predictors
std_feats = ['price','averageRating','supportingDevices']
scaler = StandardScaler()
df_std = df.copy()
df_std[std_feats] = scaler.fit_transform(df_std[std_feats])

# rebuild & refit
y_s, X_s = patsy.dmatrices(formula, data=df_std, return_type='dataframe')
Xt, Xv, yt, yv = train_test_split(X_s, y_s, test_size=0.2, random_state=42)
model_std = sm.OLS(yt, Xt).fit()

print("Standardized βs:\n", model_std.params[std_feats + ['C(category)[T.Social Networking]']])

Standardized βs:
 price                              -0.141134
averageRating                       2.198626
supportingDevices                  -0.011910
C(category)[T.Social Networking]    2.632864
dtype: float64
