In [1]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_validate
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder

In [2]:
apps_df = pd.read_csv('googleplaystore.csv',
                       names = ['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
                                'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
                                'Android Ver'])

In [3]:
apps_df = apps_df[apps_df['Rating'].notna()]
apps_df['Reviews'] = apps_df['Reviews'].str.extract('(\d+)', expand=False)

In [4]:
features = ['Category', 'Type', 'Content Rating', 'Genres']

X = apps_df[features][1:]
y = apps_df['Rating'][1:]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
X_train.head()

Unnamed: 0,Category,Type,Content Rating,Genres
2746,SHOPPING,Free,Everyone,Shopping
8946,PHOTOGRAPHY,Free,Everyone,Photography
663,DATING,Free,Teen,Dating
3335,TOOLS,Free,Everyone,Tools
2691,SHOPPING,Free,Everyone,Shopping


In [7]:
y_train.head()

2746    4.3
8946    3.1
663     4.1
3335    4.2
2691    4.3
Name: Rating, dtype: object

In [8]:
# We will now build a preprocessing pipeline

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse= False))
])

preprocessor = ColumnTransformer([
    ('ohe', categorical_transformer, features)])
preprocessor.fit(apps_df);

In [9]:
model = Pipeline([
    ("preprocessor", preprocessor),
    ("lgbm", LGBMRegressor())
])
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('ohe',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['Category', 'Type',
                                                   'Content Rating',
                                                   'Genres'])])),
                ('lgbm', LGBMRegressor())])

In [10]:
model.score(X_train, y_train)

0.0559898600591312

In [11]:
model.score(X_test, y_test)

0.03204640103580347

In [12]:
model = Pipeline([
    ("preprocessor", preprocessor),
    ("lgbm", LGBMRegressor())
])
model.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('ohe',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['Category', 'Type',
                                                   'Content Rating',
                                                   'Genres'])])),
                ('lgbm', LGBMRegressor())])

In [13]:
type(X_test)

pandas.core.frame.DataFrame

In [14]:
model.predict(X_test)

array([4.41793988, 4.21682324, 4.02936772, ..., 4.14257078, 4.21263657,
       4.23243509])

In [15]:
X_test.iloc[1]

Category            SPORTS
Type                  Free
Content Rating    Everyone
Genres              Sports
Name: 3063, dtype: object

In [18]:
with open('web_api/app_predictor.joblib', 'wb') as f:
    joblib.dump(model, f)
with open('web_application/app_predictor.joblib', 'wb') as f:
    joblib.dump(model, f)

FileNotFoundError: [Errno 2] No such file or directory: 'web_api/app_predictor.joblib'

In [20]:
def return_prediction(model, input_json):
    
    input_data = pd.DataFrame(input_json)
    prediction = model.predict(input_data)[0]
    
    return prediction

In [21]:
example_input_json = {
    'Category' : ['DATING'],
    'Type' : ['Free'],
    'Content Rating' : ['Everyone'],
    'Genres' : ['Dating']
}

In [22]:
return_prediction(model, example_input_json)

4.003780408787758

now let's go to the URL http://127.0.0.1:5000/.

In [24]:
!curl -d '{"Category":["DATING"],"Type":["Free"],"Content Rating":["Everyone"],"Genres":["Dating"]}' \
      -H "Content-Type: application/json" \
      -X POST http://localhost:5000/predict

4.003780408787758
