<a href="https://colab.research.google.com/github/LambdaTheda/DS-Unit-2-Linear-Models/blob/master/pokemonThurMar12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, LabelBinarizer, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix, classification_report

In [0]:
df = pd.read_csv('300k.csv')
df_names = pd.read_csv('pokemonNumbers.csv')

In [0]:
df = df.merge(df_names, left_on='pokemonId', right_on='1', how='left')

In [0]:
# target = 'appearedDayOfWeek'
# categorical_features = ['class', 'rural', 'closeToWater', 'continent', 'urban', 'suburban', 'city', 'weather']
# numeric_features = ['pokestopDistanceKm', 'gymDistanceKm', 'temperature', 'windBearing', 'pressure', 'latitude', 'longitude']

target = 'city'
categorical_features = ['class', 'terrainType', 'appearedDay', 'weather']
numeric_features = ['population_density', 'windSpeed', 'temperature']

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df[categorical_features + numeric_features], df[target])

In [0]:
def convert_to_numeric(X):
    try:
        return X.replace(r'([^\.\d])', pd.np.nan, regex=True).astype(float)
    except Exception as e:
        if X.dtype.name in ['float64', 'int64']:
            return X
        raise e

In [0]:
categorical_transformer = Pipeline(
    steps=(
        ('imputer', SimpleImputer(strategy='constant')),
        ('encode', OneHotEncoder(handle_unknown='ignore'))
    )
)

numeric_transformer = Pipeline(
    steps=(
        ('convert', FunctionTransformer(convert_to_numeric)),
        ('imputer', SimpleImputer(strategy='median')),
        ('scale', StandardScaler())
    )
)

preprocessor = ColumnTransformer(
    transformers=(
        ('categorical', categorical_transformer, categorical_features),
        ('numeric', numeric_transformer, numeric_features)
    )
)

In [0]:
rf_pipeline = Pipeline(
    steps=(
        ('preprocess', preprocessor),
        ('classify', RandomForestClassifier())
    )
)

lr_pipeline = Pipeline(
    steps=(
        ('preprocess', preprocessor),
        ('classify', LogisticRegression())
    )
)

In [11]:
rf_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=(('categorical',
                                                  Pipeline(memory=None,
                                                           steps=(('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                     

In [12]:
lr_pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=(('categorical',
                                                  Pipeline(memory=None,
                                                           steps=(('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                     

In [0]:
y_pred_rf = rf_pipeline.predict(X_test)

In [0]:
y_pred_lr = lr_pipeline.predict(X_test)

In [15]:
print(f"RF Accuracy score: {accuracy_score(y_test, y_pred_rf)}")
print(classification_report(y_test, y_pred_rf))

RF Accuracy score: 0.994824016563147
              precision    recall  f1-score   support

      Berlin       1.00      1.00      1.00         5
   Bucharest       1.00      1.00      1.00         4
Buenos_Aires       1.00      1.00      1.00         6
     Chicago       0.98      0.99      0.99       197
      Denver       1.00      1.00      1.00        25
    Edmonton       1.00      1.00      1.00        40
     Halifax       1.00      1.00      1.00        11
Indianapolis       1.00      1.00      1.00         5
Kuala_Lumpur       1.00      1.00      1.00         1
     Kuching       1.00      1.00      1.00         4
   Ljubljana       1.00      1.00      1.00         2
 Los_Angeles       1.00      1.00      1.00        66
      Manila       1.00      1.00      1.00         1
   Melbourne       0.00      0.00      0.00         1
 Mexico_City       1.00      0.99      0.99        97
   Monterrey       1.00      1.00      1.00        22
    New_York       0.99      1.00      0.99 

  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
print(f"LR Accuracy score: {accuracy_score(y_test, y_pred_lr)}")
print(classification_report(y_test, y_pred_lr))

LR Accuracy score: 0.8157349896480331
              precision    recall  f1-score   support

      Berlin       0.00      0.00      0.00         5
   Bucharest       1.00      1.00      1.00         4
Buenos_Aires       1.00      0.83      0.91         6
     Chicago       0.68      0.87      0.76       197
      Denver       0.92      0.96      0.94        25
    Edmonton       0.97      0.97      0.97        40
     Halifax       1.00      1.00      1.00        11
Indianapolis       0.00      0.00      0.00         5
Kuala_Lumpur       0.00      0.00      0.00         1
     Kuching       1.00      0.25      0.40         4
   Ljubljana       0.00      0.00      0.00         2
 Los_Angeles       0.88      0.53      0.66        66
      Manila       0.00      0.00      0.00         1
   Melbourne       0.00      0.00      0.00         1
 Mexico_City       0.97      0.80      0.88        97
   Monterrey       0.88      0.95      0.91        22
    New_York       0.74      0.77      0.75

  _warn_prf(average, modifier, msg_start, len(result))
