## Prediction of Wine Shop Varieties

In [1]:
#All Imports
import os
import re
import string
import pickle
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from textblob import TextBlob
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.preprocessing import LabelEncoder,StandardScaler
plt.rcParams["figure.figsize"] = (10,7)
%matplotlib inline

In [13]:
## Get File details in the current working directory
def find_files():
    """Return the names of all entries in the current working directory."""
    return os.listdir(os.getcwd())

In [None]:
# Read the raw data.
# These loads must actually run: the exploration and cleaning cells below read
# `train_df` / `test_df`, and nothing else in the notebook defines them.
train_df=pd.read_csv("Data/train.csv")
test_df=pd.read_csv("Data/test.csv")
# Separate copy of the training frame as loaded.
df1=train_df.copy()

## Data Cleaning

In [None]:
train_df.info()

In [None]:
train_df.shape

In [None]:
train_df.head()

## Plot Country Wise data

In [None]:
wn_country = train_df.groupby(['country']).count()[["province"]].apply(lambda x : 100*x/x.sum())
wn_country.head()

In [None]:
#Plot Normalized Data

wn_country.plot(kind='bar')


## Classification based on prices by country

In [None]:
# Where the single most / least expensive wine comes from.
# idxmax()/idxmin() return index *labels*, so they are looked up with .loc (not the
# positional .iloc), and the column is selected by name instead of a magic position.
# NOTE(review): the original read positional column 1; this assumes that column is
# `country`, as the printed messages imply - confirm against the CSV header.
mx=train_df.loc[train_df["price"].idxmax(),"country"]
mn=train_df.loc[train_df["price"].idxmin(),"country"]
# Series.min()/max() skip missing prices; the builtin min()/max() give
# order-dependent results when the column contains NaN.
print("Cheapest wines is sold  in \"{}\" - {:g}.\n".format(mn,train_df["price"].min()))
print("Costliest wines is sold  in \"{}\" - {:g}.\n".format(mx,train_df["price"].max()))

In [None]:

# Average price per country.
# The numeric column is selected *before* aggregating: calling .mean() on the whole
# grouped frame also tries to average the text columns, which raises TypeError on
# pandas >= 2.0.
price_df = train_df.groupby('country')[["price"]].mean()
max_price=price_df["price"].idxmax()
min_price=price_df["price"].idxmin()
# Series.min()/max() ignore NaN (e.g. a country whose prices are all missing).
print("Cheaper wines are sold (On Average) in \"{}\" - {:g}$.\n".format(min_price,price_df["price"].min()))
print("Costlier wines are sold (On Average) in \"{}\" - {:g}$.\n".format(max_price,price_df["price"].max()))

In [None]:
price_df.plot(kind="bar",color="orange")

In [None]:
## Ratings by Country

In [None]:

# Average rating (points) per country.
# The numeric column is selected *before* aggregating: calling .mean() on the whole
# grouped frame also tries to average the text columns, which raises TypeError on
# pandas >= 2.0.
rating_df = train_df.groupby('country')[["points"]].mean()
max_rate=rating_df["points"].idxmax()
min_rate=rating_df["points"].idxmin()
# Series.min()/max() ignore NaN, unlike the builtin min()/max().
print("Bad ratings are given (On Average) in \"{}\" - {:g}.\n".format(min_rate,rating_df["points"].min()))
print("Good ratings are given (On Average) in \"{}\" - {:g}.\n".format(max_rate,rating_df["points"].max()))

In [None]:
rating_df.plot()

## Most Common Winery

In [None]:
#Function to get the most common item

def get_most_common_item(df):
    """Count the values in ``df`` and return ``(value, count)`` pairs, most frequent first."""
    return Counter(df).most_common()

In [None]:
cw=get_most_common_item(train_df["winery"])
print("Most wine trade is from \"{}\" with a count of {}.\n".format(cw[0][0],cw[0][1]))

In [None]:
items_by_winery = train_df.groupby(['country'])[['winery']].count().apply(lambda x : 100*x/x.sum())
items_by_winery.head()

In [None]:
plt.xticks(rotation=90)
plt.xlabel("Country")
plt.ylabel("Winery Count")
plt.plot(items_by_winery)

## Insights
- From the above plot we can say that the US has the highest number of wineries.

In [None]:
vc=get_most_common_item(train_df["variety"])
print("Most wine trade is from \"{}\" with a count of {}.\n".format(vc[0][0],vc[0][1]))

In [None]:
items_by_variety = train_df.groupby(['country'])[['variety']].size()
items_by_variety.head()

In [None]:
#Plots
plt.xticks(rotation=90)
plt.xlabel("Country")
plt.ylabel("Variety Count")
plt.plot(items_by_variety)

## Insights
- The country with the highest number of wineries also has more varieties than the others.
- The US has a large number of wine varieties.

## Plot on No of provinces in each country.

In [None]:

# Grouping on (country, province) yields one index entry per distinct pair.
items_by_c=train_df.groupby(['country','province'])['variety'].unique()
tst=items_by_c.index

# Tally how many provinces appear under each country.
location_cluster={}
for country, _province in tst:
    location_cluster[country]=location_cluster.get(country,0)+1

plt.bar(location_cluster.keys(), location_cluster.values(), color='g')
plt.xticks(rotation=90)
plt.show()

## No of Varieties classification based on country

In [None]:
items_by_v=train_df.groupby(['country'])['variety'].count()
items_by_v.head()

In [None]:
# One marker per country on a single horizontal line; the marker area encodes the
# per-country count held in `items_by_v`.
# (The original also incremented a counter `cnt` that was never read; it is removed.)
var_cnt=list(items_by_v.index)
x = var_cnt
y = [0]*len(x)
n=len(y)
# Divide each count by the number of countries to get the marker size.
s = [i/n for i in items_by_v]
plt.scatter(x,y,s=s,color='r')
plt.xticks(rotation=90)
plt.show()

- The plot shows the number of variety entries (rows with a non-null variety) per country.
- The larger the circle, the more entries that country has.

## Drop the columns which are not necessary

In [None]:
train_df.columns

In [26]:
def drop_cols(df,cols):
    """Return a new frame equal to ``df`` without the column(s) named in ``cols``.

    ``cols`` may be a single column label or a list of labels; ``df`` itself is
    left unchanged.
    """
    trimmed = df.drop(columns=cols)
    return trimmed

In [None]:
#Remove the Unnecessary data_columns
cleaned_df=drop_cols(train_df,["designation","review_title","user_name","region_2","country","region_1","province"])

## Dealing with null values

In [None]:
cleaned_df.isnull().sum()

- As we have null values in the price column, we impute that particular column.

In [None]:
def impute(df):
    """Replace missing entries of a single column with the column mean.

    ``df`` must expose ``.values`` (the notebook passes a pandas Series). The
    values are reshaped into one column, a mean-strategy ``SimpleImputer`` is
    fitted on them, and the transformed array is returned.
    """
    column = df.values.reshape(-1,1)
    imputer = SimpleImputer(strategy='mean')
    # fit() returns the imputer itself, so fitting and transforming can be chained.
    return imputer.fit(column).transform(column)

In [None]:
# Mean-impute the missing prices, then swap the imputed values in for the original column.
price1=impute(cleaned_df['price'])
# The original column is dropped and re-added, so `price` ends up as the last column.
cleaned_df=drop_cols(cleaned_df,["price"])
cleaned_df['price']=price1

In [None]:
cleaned_df.isnull().sum()

## Review Description
- we apply sentiment analysis to the reviews.
- we take the polarity value and replace with text.

In [None]:
# Apply a first round of text cleaning techniques
#make All text to lowecase,remove numericals
def clean_review(text):
    """Normalise one review string before sentiment scoring.

    Steps, in order: lower-case the text, remove anything inside square
    brackets, remove every ASCII punctuation character, and remove every word
    that contains a digit. Whitespace is left untouched, so removed tokens can
    leave double spaces behind.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \w, \d) out of Python's own
    # string-escape processing, which warns about them on current versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
def text_processing(cleaned_df):
    """Replace each review text with its sentiment polarity scaled by 100.

    Every entry of the ``review_description`` column is cleaned with
    ``clean_review`` and scored with TextBlob; the polarity is multiplied by
    100 and written back into the same column.

    The original computed the cleaned text but then scored the *raw* text, so
    the cleaning step had no effect. The cleaned text is now the one scored.
    Pickles cached from earlier runs hold scores of the raw text and need to be
    regenerated to pick this up.

    Parameters
    ----------
    cleaned_df : pandas.DataFrame
        Frame with a text column named ``review_description``. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``cleaned_df`` object, with ``review_description`` now numeric.
    """
    #Apply Clean review function
    review=cleaned_df['review_description'].apply(clean_review)
    # Score the *cleaned* text; the factor 100 rescales the polarity as before.
    pol = lambda x: TextBlob(x).sentiment.polarity*100
    cleaned_df['review_description']=review.apply(pol)
    return cleaned_df

- Text processing is slow, so its result is stored in the `Ztrain_txt_df.pkl` file and a condition skips the step when that file already exists.
- Instead of running the processing again, we load the saved file.

In [None]:

# Run the sentiment step only when no cached result exists in the working
# directory; otherwise `df_text` is left undefined by this cell.
if("Ztrain_txt_df.pkl" not in find_files()):
    df_text=text_processing(cleaned_df)
    # NOTE(review): nested inside the `if`, this preview is presumably not rendered by the notebook - confirm.
    df_text.head()

In [None]:
# Write the cache only when it does not exist yet. When the file is already on disk
# the processing cell above is skipped and `df_text` has not been computed in this
# session, so an unconditional write raises NameError on a fresh kernel.
if "Ztrain_txt_df.pkl" not in find_files():
    df_text.to_pickle('Ztrain_txt_df.pkl')

In [24]:
df_text=pd.read_pickle('Ztrain_txt_df.pkl')
df_text.head()

Unnamed: 0,review_description,points,winery,variety,price
0,10.833333,83,Andrew Peace,Chardonnay,10.0
1,-5.833333,89,North by Northwest,Red Blend,15.0
2,-20.416667,94,Renato Ratti,Nebbiolo,80.0
3,41.666667,87,Domaine l'Ancienne Cure,Bordeaux-style White Blend,22.0
4,26.547619,88,Château du Cèdre,Malbec,33.0


In [27]:
y_train=df_text['variety']
df_text=drop_cols(df_text,'variety')

## Dealing with Categorical Data

## Encoding the target- variety column




In [28]:
lbl=LabelEncoder()
y_train=lbl.fit_transform(y_train)
y_train

array([ 5, 17, 11, ...,  6,  0,  3])

## Encoding Winery data

In [None]:
wine=df_text['winery']
wine=wine.to_numpy()

In [None]:
df_text.columns

  
  ## Formula used
  
  -  \begin{equation*} probability = \frac{\text{number of occurrences of the item}}{\text{total number of items}} \end{equation*}

In [None]:
#Function to calculate basic probability
def cal_proba(X):
    """Map every distinct value in ``X`` to its relative frequency.

    ``X`` must be iterable and expose ``.shape`` (the notebook passes a 1-D
    NumPy array). Each frequency is the value's count divided by
    ``X.shape[0]``; keys appear in first-seen order.
    """
    total = X.shape[0]
    occurrences = Counter(X)
    return {value: hits/total for value, hits in occurrences.items()}

In [None]:
#Replace each winery name with its relative frequency so it can be correlated with price
mps=cal_proba(wine)
df_text["winery"]=df_text["winery"].map(mps)
df_text.head()

In [None]:
sample=df_text.loc[:,['winery', 'price']]
sample.head()

In [None]:
sample.corr() 

In [None]:
df_text.to_pickle("Zdf_text2.pkl")

## Cleaning The testData

In [None]:
test_df.head()

In [None]:
clean_tst=drop_cols(test_df,["designation","review_title","user_name","region_2","country","region_1","province"])
clean_tst.head()

In [None]:
if("Ztest_df.pkl" not in find_files()):
    df_test=text_processing(clean_tst)
    df_test.head()

In [None]:
# Write the cache only when it does not exist yet. When the file is already on disk
# the processing cell above is skipped and `df_test` has not been computed in this
# session, so an unconditional write raises NameError on a fresh kernel.
if "Ztest_df.pkl" not in find_files():
    df_test.to_pickle("Ztest_df.pkl")

In [None]:
df_test=pd.read_pickle('Ztest_df.pkl')
df_test.head()

In [None]:
tst_wine=df_test['winery']
tst_wine=tst_wine.to_numpy()

In [None]:
#Replace each winery name in the test data with its relative frequency
# NOTE(review): these frequencies are computed from the test set itself, so the same
# winery gets a different encoding here than in the training data - confirm whether
# the training mapping should be reused instead.
mps2=cal_proba(tst_wine)
df_test["winery"]=df_test["winery"].map(mps2)
df_test.head()

In [None]:
## Saving cleaned to test file

In [None]:
df_test.isnull().sum()

In [None]:
df_test['price']=impute(df_test['price'])
df_test.head()

In [None]:
df_test.isnull().sum()

In [None]:
df_test.to_pickle("Ztst_text2.pkl")

In [29]:
#TEST DATA
df_test=pd.read_pickle("Ztst_text2.pkl")
df_test.head()

Unnamed: 0,review_description,points,price,winery
0,24.910714,88,35.0,0.000242
1,50.0,90,60.0,0.000145
2,8.4375,87,38.0,0.000145
3,10.416667,91,20.0,9.7e-05
4,-4.166667,90,49.0,9.7e-05


In [30]:
#TRAIN DATA
df_text=pd.read_pickle("Zdf_text2.pkl")
df_text.head()

Unnamed: 0,review_description,points,winery,price
0,10.833333,83,2.4e-05,10.0
1,-5.833333,89,0.00023,15.0
2,-20.416667,94,0.000145,80.0
3,41.666667,87,0.000109,22.0
4,26.547619,88,0.000339,33.0


## Training using SVM 

In [7]:
#Scaling the data
scale=StandardScaler()
# Fit the scaler on the training features only.
X_train=scale.fit_transform(df_text)
# Apply the *training* mean/variance to the test features instead of re-fitting on the
# test set. The columns are reordered to the training layout first: `df_test` stores
# price before winery while `df_text` stores winery before price, so without this the
# two features would be swapped in X_test.
X_test=scale.transform(df_test[df_text.columns])

## Hyper parameters to tune

In [8]:
params_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000]}]

In [None]:
mdl_name = "Zmodel.pkl"

In [None]:
if mdl_name not in find_files():
    print("Training Started......\n")
    model=GridSearchCV(SVC(), params_grid, cv=5)
    print(".\n.\n.")
    model.fit(X_train, y_train)
    print(".\n.\n.\n.\n")
    print("Training Completed Succesfully\n")

## Random Forest

In [9]:
rfc=RandomForestClassifier(random_state=42)


In [10]:
# Search space for the random-forest grid search.
param_grid2 = { 
    'n_estimators': [200, 500],
    # 'auto' is dropped: for classifiers it is an alias of 'sqrt' (so it only
    # duplicated fits), and recent scikit-learn releases reject it.
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [11]:
mdl_name2 = "Zmodel2.pkl"

In [32]:
if mdl_name2 not in find_files():
    print("Training Started......\n")
    model2=GridSearchCV(rfc, param_grid2, cv=5)
    print(".\n.\n.")
    model2.fit(X_train, y_train)
    print(".\n.\n.\n.\n")
    print("Training Completed Succesfully\n")

Training Started......

.
.
.
.
.
.
.

Training Completed Succesfully



## Save Model into a file
- As the trained model has been stored in a file, we don't train it again.
- We add a condition and load the saved model instead.

In [None]:

# Save the fitted SVM grid search, but only when no saved model exists yet.
# When the file is already present the training cell is skipped and `model` has not
# been fitted in this session; opening the file with 'wb' would then truncate the
# saved model before pickle.dump fails.
if mdl_name not in find_files():
    with open(mdl_name, 'wb') as file:
        pickle.dump(model, file)
    


In [33]:
# Save the fitted random-forest grid search, but only when no saved model exists yet.
# When the file is already present the training cell is skipped and `model2` has not
# been fitted in this session; opening the file with 'wb' would then truncate the
# saved model before pickle.dump fails.
if mdl_name2 not in find_files():
    with open(mdl_name2, 'wb') as file:
        pickle.dump(model2, file)

In [None]:
## Gradient boosting

In [34]:
mdl_name3 = "Zmodel3.pkl"

In [37]:
# Search space for the gradient-boosting grid search.
# - "loss" is no longer listed: the only value searched was "deviance", which is the
#   estimator's default loss, and that spelling is rejected by recent scikit-learn
#   releases (it was renamed "log_loss"). Leaving it at the default works on every version.
# - "mae" is dropped from "criterion": it was deprecated and then removed from
#   scikit-learn, so every fit using it would fail on current releases.
# NOTE(review): this grid is still tens of thousands of combinations times 5 folds;
# consider shrinking it or using a randomized search.
param_grid3 = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10]
    }

In [38]:
gdbc=GradientBoostingClassifier(random_state=42)

In [None]:
if mdl_name3 not in find_files():
    print("Training Started......\n")
    model3=GridSearchCV(gdbc, param_grid3, cv=5)
    print(".\n.\n.")
    model3.fit(X_train, y_train)
    print(".\n.\n.\n.\n")
    print("Training Completed Succesfully\n")

Training Started......

.
.
.


In [None]:
# Save the fitted gradient-boosting grid search, but only when no saved model exists yet.
# When the file is already present the training cell is skipped and `model3` has not
# been fitted in this session; opening the file with 'wb' would then truncate the
# saved model before pickle.dump fails.
if mdl_name3 not in find_files():
    with open(mdl_name3, 'wb') as file:
        pickle.dump(model3, file)

## Loading the file

In [None]:

# Load the saved SVM grid search from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load model files that
# were produced by this notebook.
with open(mdl_name, 'rb') as file:  
    model = pickle.load(file)


# Load the saved random-forest grid search from disk.
with open(mdl_name2, 'rb') as file:  
    model2 = pickle.load(file)


## Metrics

In [None]:
#Accuracy Score
print('Best score for training data:', model.best_score_,"\n") 

In [None]:
# View the accuracy score


# View the best parameters for the model found using grid search
print('Best C:',model.best_estimator_.C,"\n") 
print('Best Kernel:',model.best_estimator_.kernel,"\n")
print('Best Gamma:',model.best_estimator_.gamma,"\n")

## Final Model after hyperparameters tuning used for prediction

In [None]:
final_model = model.best_estimator_
y_pred = final_model.predict(X_test)
y_predl = list(lbl.inverse_transform(y_pred))

In [None]:
print("Training set score for SVM: %f" % final_model.score(X_train , y_train))