<a href="https://colab.research.google.com/github/repoocsov/DS-Unit-2-Applied-Modeling/blob/master/module2-wrangle-ml-datasets/LS_DS_232_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Pre-Model**

### Loading, Cleaning, and Splitting

In [0]:
import pandas as pd
import numpy as np
# Column names for the headerless reviews export, in file order.
colnames = ['site', 'vendor', 'timestamp', 'score', 'btc_amount', 'comment']

# Options shared by both pandas code paths below.
read_options = dict(
    names=colnames,
    header=None,
    encoding='UTF-8',
    lineterminator='\n',  # rows are split on '\n' only
)

# Malformed rows are skipped (with a warning) instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour of
# `on_bad_lines`; 'warn' matches the old `error_bad_lines=False` behaviour with
# its default `warn_bad_lines=True`. Older pandas rejects the new keyword with a
# TypeError, in which case the legacy spelling is used.
try:
    df = pd.read_csv('2020-01-13-kilos-6dnms-reviews.csv', on_bad_lines='warn', **read_options)
except TypeError:
    df = pd.read_csv('2020-01-13-kilos-6dnms-reviews.csv', error_bad_lines=False, **read_options)

In [2]:
# (rows, columns) of the freshly loaded reviews frame
df.shape

(235652, 6)

In [3]:
# Confirm the column names supplied at load time were applied
df.columns

Index(['site', 'vendor', 'timestamp', 'score', 'btc_amount', 'comment'], dtype='object')

In [4]:
# Peek at the last five rows of the loaded frame
df.tail(5)

Unnamed: 0,site,vendor,timestamp,score,btc_amount,comment
235647,Apollon,DrSommer,1578825660,1,0.001541,"Good stealth, good product"
235648,Apollon,DrSommer,1578827520,1,0.002712,very good
235649,Apollon,DrSommer,1578827760,1,0.001603,Excellent as usual
235650,Apollon,DrSommer,1578827820,1,0.001726,Excellent as usual
235651,Apollon,DrSommer,1578827820,1,0.002096,"excellent stealth, fast shipping, original bli..."


In [5]:
import warnings
try:
    from pandas.errors import SettingWithCopyWarning  # pandas >= 1.5
except ImportError:
    from pandas.core.common import SettingWithCopyWarning  # older pandas
# The filter is global and stays in place for later cells, which assign columns
# on frames derived from `df`. Nothing in this cell relies on it.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# GETTING BTC PRICE ON THOSE DATES
# NOTE: this cell rebinds `df` (timestamp converted, price columns merged in);
# re-run the load cell before running it a second time.

# --- BTC price dataframe ---
df2 = pd.read_csv('btc.csv')
# .copy() makes `subset` an independent frame, so the assignment below is a
# plain column write rather than a write into a slice of df2.
# NOTE(review): presumably one row per calendar date — duplicate dates would
# duplicate reviews in the merge below; confirm against btc.csv.
subset = df2[['PriceUSD', 'date']].copy()
subset['date'] = pd.to_datetime(subset['date'])

# --- original dataframe ---
# Epoch seconds -> UTC timestamps.
review_ts = pd.to_datetime(df['timestamp'], origin='unix', unit='s', utc=True)

# Row 0's timestamp is treated as missing (NaT).
# NOTE(review): why row 0 is singled out is not visible in this notebook — confirm.
review_ts.loc[0] = pd.NaT

# Drop the timezone (keeping the UTC wall time) and truncate to midnight so the
# value can be joined against the daily price table. Vectorised equivalent of
# calling .replace(tzinfo=None).date() on every element.
df['timestamp'] = review_ts.dt.tz_localize(None).dt.normalize()

# ZEROES CURRENTLY DROPPED
# A zero timestamp converts to 1970-01-01; presumably that day is absent from
# btc.csv, so the inner merge discards those rows — verify. (Substituting a
# placeholder epoch for zeros was considered and left disabled.)
# Inner join: only reviews whose day has a match in `subset` are kept.
df = pd.merge(df, subset, how='inner', left_on='timestamp', right_on='date')
df

Unnamed: 0,site,vendor,timestamp,score,btc_amount,comment,PriceUSD,date
0,Empire,ofgrey,2018-02-19,1,0.002418,"Good quality product, everything is very well ...",11139.355730,2018-02-19
1,Empire,ofgrey,2018-02-19,1,0.000092,good best price thank you,11139.355730,2018-02-19
2,Empire,ofgrey,2018-02-19,1,0.002418,Best vendor !!!,11139.355730,2018-02-19
3,Empire,ofgrey,2018-02-19,1,0.000150,good thank you,11139.355730,2018-02-19
4,Empire,DrunkDragon,2018-03-02,-1,0.000113,"scam,scam scam,save urself the trouble.SCAM.",11014.058470,2018-03-02
...,...,...,...,...,...,...,...,...
225409,Apollon,DrSommer,2020-01-12,1,0.001541,"Good stealth, good product",8164.549993,2020-01-12
225410,Apollon,DrSommer,2020-01-12,1,0.002712,very good,8164.549993,2020-01-12
225411,Apollon,DrSommer,2020-01-12,1,0.001603,Excellent as usual,8164.549993,2020-01-12
225412,Apollon,DrSommer,2020-01-12,1,0.001726,Excellent as usual,8164.549993,2020-01-12


In [6]:
""" FEATURES """
#   categorical: site, vendor, comment
#   numeric:     timestamp, btc_amount
#
# TARGET
#   score: sentiment of the comment attached to the purchase
#          (1 = positive, 0 = neutral, -1 = negative).
#
# The classes are heavily imbalanced, so 'balanced_accuracy' is tracked in
# addition to plain accuracy.
#
# Distribution of the target:
#   93.9% positive
#    3.2% negative
#    2.9% neutral
#
# This is a multi-class classification problem with three categories.
# Always predicting the majority class (1) gives a 93.9% accuracy baseline.
score_share = df['score'].value_counts(normalize=True)
score_share

 1    0.938526
-1    0.032079
 0    0.029395
Name: score, dtype: float64

In [7]:
# Summary statistics of the review dates in the merged frame
df['timestamp'].describe()

count                  225414
unique                    654
top       2019-11-06 00:00:00
freq                     2505
first     2018-02-19 00:00:00
last      2020-01-12 00:00:00
Name: timestamp, dtype: object

In [0]:
# A split along timestamp would be nice, but timestamp has missing values
# (zeros), so the data is split randomly and stratified instead.
# Stratification priority: score first, then timestamp/site.
from sklearn.model_selection import train_test_split

target = 'score'
features = ['site', 'vendor', 'btc_amount', 'comment', 'PriceUSD', 'date']

X_df, y_df = df[features], df[target]

# Hold out 20% as the test set, preserving the class proportions of the target.
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_df,
    test_size=0.2,
    stratify=y_df,
    random_state=42,
)

In [9]:
# Sizes of the feature/target pieces after the first split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((180331, 6), (45083, 6), (180331,), (45083,))

In [10]:
# Second split: carve a validation set (20%) out of the training portion,
# again stratified on the target.
# Note: X_train / y_train are rebound to the smaller remainder here, so running
# this cell a second time shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=49,
)

X_train.shape, X_val.shape, X_test.shape

((144264, 6), (36067, 6), (45083, 6))

### Wrangle

In [0]:
""" WRANGLE FUNCTION """

def wrangle(dataframe):
  """Prepare one data split for modelling.

  Three columns are written directly onto ``dataframe``, so the caller's
  frame is modified in place:

  * ``date``    -- cast to 64-bit integers.
  * ``comment`` -- missing values become the empty string.
  * ``price``   -- new column, ``PriceUSD * btc_amount``.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Needs ``date``, ``comment``, ``PriceUSD`` and ``btc_amount``;
      ``site`` and ``vendor`` are carried through unchanged.

  Returns
  -------
  pandas.DataFrame
      A new frame with exactly the columns ``site, vendor, date, comment,
      price, btc_amount`` in that order. ``PriceUSD`` is not carried into
      the result; its information enters through ``price``.
  """

  # DATE
  # Explicit 'int64': the builtin ``int`` maps to a platform-dependent width
  # on NumPy < 2 (32-bit on Windows).
  dataframe['date'] = dataframe['date'].astype('int64')

  # COMMENTS
  dataframe['comment'] = dataframe['comment'].fillna('')

  # DOLLAR VALUE
  dataframe['price'] = dataframe['PriceUSD'] * dataframe['btc_amount']

  # ORGANIZING COLUMNS
  # reindex returns a new frame restricted to (and ordered by) these columns.
  ordered_columns = ['site', 'vendor', 'date', 'comment', 'price', 'btc_amount']
  return dataframe.reindex(columns=ordered_columns)

In [12]:
# wrangle() writes its changes onto the frame it is given, so each split is
# updated even though the returned frames are not kept.
for split in (X_train, X_val):
    wrangle(split)

# Last expression of the cell: display the reorganised view of the test split.
wrangle(X_test)

Unnamed: 0,site,vendor,date,comment,price,btc_amount
7158,Cannazon,Briefsoven,1076.466594,"Came very quickly, excellent quality, great pr...",34.435427,0.006210
19599,Cannazon,radarbreeder,1076.884604,"all perfect, thanks",58.453416,0.007641
80494,Apollon,youngbanger,1077.876615,"Fucking good stuff,fucking great stealth shipp...",63.783312,0.006131
45774,Empire,DrunkDragon,1077.470443,750 000 codes 0 correct so I don't believe in ...,1.206703,0.000115
44418,Empire,SATAII,1077.450610,No feedback comment,68.810629,0.006841
...,...,...,...,...,...,...
109950,Empire,DrHappy,1078.143677,"Schnelle Lieferung, gute Ware \r\nGerne wieder",197.872553,0.023013
10944,Cannazon,Bllume,1076.645840,2DD fr den preis super Zeug! sehr freundlicher...,274.078230,0.037416
112814,Apollon,Germanapotheke,1078.163446,Danke perfekt,0.901041,0.000109
79125,Cryptonia,aKINDercare,1077.856820,recommend this vendor and product.,29.249261,0.002900


# **Building a Model**

In [0]:
%%capture
import sys
import numpy as np
import pandas as pd

# If you're on Colab:
if 'google.colab' in sys.modules:
    # NOTE(review): DATA_PATH is not referenced by any cell visible here and
    # points at a different repository's data folder — confirm it is needed.
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/master/data/'
    # Install the 2.x line of category_encoders, which is imported in a later cell.
    !pip install category_encoders==2.*

In [14]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce
from sklearn.pipeline import make_pipeline

# Baseline model: integer-code the categorical columns, then fit a random forest.
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    # Fixed seed so that refitting (e.g. after a kernel restart) grows the same
    # forest and the scores reported in the following cells are reproducible.
    RandomForestClassifier(random_state=42)
)

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['site', 'vendor', 'comment'],
                                drop_invariant=False, handle_missing='value',
                                handle_unknown='value',
                                mapping=[{'col': 'site',
                                          'data_type': dtype('O'),
                                          'mapping': Cryptonia    1
Empire       2
Apollon      3
CannaHome    4
Cannazon     5
NaN         -2
dtype: int64},
                                         {'col': 'vendor',
                                          'data_type': dtype('O'),
                                          'mapping': valhalla              1
PremierLeague...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
              

In [15]:
# Validation accuracy of the fitted pipeline (the majority-class baseline was ~.93)
val_accuracy = pipeline.score(X_val, y_val)
print("The accuracy is:", val_accuracy)

The accuracy is: 0.9467657415365847


In [16]:
# AVERAGE RECALL SCORE
from sklearn.metrics import balanced_accuracy_score

"""
The balanced accuracy in binary and multiclass classification problems to deal with imbalanced datasets.
It is defined as the average of recall obtained on each class.
"""
''' Recall literally is how many of the true positives were recalled (found), i.e. how many of the correct hits were also found. '''

y_pred = [1] * len(y_val)
print("The average recall with just guessing 1 for every instance is:", balanced_accuracy_score(y_val, y_pred))

The average recall with just guessing 1 for every instance is: 0.3333333333333333


In [17]:
# Same metric, now computed from the fitted pipeline's validation predictions
y_pred = pipeline.predict(X_val)
model_recall = balanced_accuracy_score(y_val, y_pred)
print("The average recall is with the model:", model_recall)

The average recall is with the model: 0.4435007370268591


# **Exploration**

In [18]:
"""
PLANS:

- Missing dates should be averaged
- Date needs to be normalized
- SMOTE CLASS BALANCER
- Check differences in sentiment between the 6 sites and plot them (normalized)

- Feature creation with the comments
    - words used
    - length
    - etc.
    spacy sklearn
    count vectorizor
    tfidf vectorizor
    https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
"""
# (rows, columns) of the merged frame, for reference while exploring
df.shape

(225414, 8)

# **Charts**

In [0]:
import matplotlib.pyplot as plt