In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
import math
from nltk.corpus import stopwords

In [2]:
#apply description pre processing

#arguments {stemming = [True, False]}
def preProcess(stemming):
    if(stemming):
        data = pd.read_csv("stemmed.csv")
    elif( not stemming):
        data = pd.read_csv("dataAfterPOS.csv")
    data.drop(["region_2"],inplace=True,axis=1)
    #remove stopwords
    stop = stopwords.words('english')

    data["stopwords"] = data["description"].apply(lambda x: len([x for x in x.split() if x in stop]))
    
    #remove commas, fullstops...
    data['description'] = data['description'].str.replace('[^\w\s]','')
    
    #remove most frequent and less frequent bcs useless
    freq = pd.Series(' '.join(data['description']).split()).value_counts()[:10]
    freq = list(freq.index)
    data['description'] = data['description'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

    unfreq = pd.Series(' '.join(data["description"]).split()).value_counts()
    count = 0
    for el in unfreq: 
        if(el < 10):
            count = count + 1

    unfreqList = unfreq[-count:]
    unfreq = dict(unfreqList)  
    data["description"] = data['description'].apply(lambda x: " ".join(x for x in x.split() if x not in unfreq))

    data.drop(["word_count","stopwords"],axis=1, inplace=True)
    data["word_count"] = data["description"].apply(lambda x: len(str(x).split(" ")))
    return data

In [3]:
def computeData(data,groupedByVariety,stemming):
        if groupedByVariety:
            #take only the description with 3 words
            fullText = pd.Series(' '.join(data['description']).split()).value_counts()
            totalWords = pd.Series("".join(data["description"]).split()).value_counts().sum()
            print(totalWords)

            t1 = data.groupby(["variety"])
            t2 = pd.DataFrame({'text' : t1['description'].apply(lambda x: (x + " ").sum())}).reset_index()
            t2["word_count"] = t2["text"].apply(lambda x: len(str(x).split(" ")))
            t2 = t2.sort_values('word_count',ascending = False)
            #t2.iloc[0]["text"]
            #t2
            varietyDict = {}
            for idx, row in t2.iterrows():
                varietyDict[row["variety"]] = pd.Series(row['text'].split()).value_counts()
            tfDict = {}
            for var in varietyDict:
                temp = {}
                for word in varietyDict[var].keys():
                    temp[word] = varietyDict[var][word] / fullText[word]   #compute tf= numero di occorrenze della parola nel documento /numero di occorrenze della stessa parola in tutti i documenti 
                tfDict[var] = temp
            #tfDict["Pinot Gris"] 
            
            desc = []
            for idx, row in data.iterrows():
                modded = []
                temp = row["description"].split()
                for word in temp:
                    modded.append(tfDict[row["variety"]][word])
                desc.append(modded)

            data["description_values_tf"] = desc  #here you can add as a column the results for term frequency

            
            varList = data["variety"].unique().tolist()

            def computeIdf1(dataset):
                a = []
                idf = {}
                for idx,row in dataset.iterrows():
                    el = row["variety"]
                    docList = data[data["variety"] == el]
                    N = len(docList)
                    idfDict = {}
                    temp = t2[t2["variety"] == el]
                    idfDict = dict.fromkeys(temp.iloc[0]["text"].split(),0)

                    for idx,row in docList.iterrows():
                        for word in row["description"].split():
                            idfDict[word] += 1

                    for word, val in idfDict.items():
                        idfDict[word] = math.log10(N/float(val))
                    a.append([el,N,idfDict])    
                    idf[el] = idfDict
                return idf, a

            idf,a = computeIdf1(t2)

            tfIdf = {}
            for var in varietyDict:
                temp = {}
                for word in tfDict[var].keys():
                    temp[word] = tfDict[var][word] * idf[var][word]
                tfIdf[var] = temp 

            desc1 = []
            for idx, row in data.iterrows():
                modded = []
                temp = row["description"].split()
                for word in temp:
                    modded.append(tfIdf[row["variety"]][word])
                desc1.append(modded)

            data["description_tfIdf"] = desc1
            #get word count
            data["word_count"] = data["description"].apply(lambda x: len(str(x).split(" ")))
            
            try:
                data.drop(["stopwords"],axis=1, inplace=True)
            except: 
                print("no stopwords found (it's ok don't worry)")
            
            if stemming:
                data.to_csv("stemmedData_tfIdf_groupedByVariety.csv", encoding='utf-8', index=False)
            else: 
                data.to_csv("notStemmedData_tfIdf_groupedByVariety.csv", encoding='utf-8', index=False)
    #if we want to work on the sum of the descriptions as a whole document
        elif not groupByVariety:
            fullText = pd.Series(' '.join(data['description']).split()).value_counts()
            #getValueCounts
            tfData = {}
            for idx, row in data.iterrows(): 
                tfData[idx] = pd.Series(row['description'].split()).value_counts();

            #compute tf for every word
            tf1 = {}
            for idx in tfData.keys():
                tf1[idx] = {}
                for word in tfData[idx].keys():
                    tf1[idx][word] = tfData[idx][word] / fullText[word]

            desc = []
            for idx, row in data.iterrows():
                modded = []
                temp = row["description"].split()
                for word in temp:
                    modded.append(tf1[idx][word])
                desc.append(modded)

            data["description_values_tf"] = desc  #here you can add as a column the results for term frequency
            
            #idf
            docCount = dict.fromkeys(fullText.keys(),0)
            idfDict1 = {}
            N = len(data)                                #numero di documenti #dizionario contenente tutte le parole presenti 
            idfData = dict(tfData)
            for el in idfData:     #per ogni descrizione
                for word,val in idfData[el].items():
                    docCount[word] = docCount[word] + 1

            for word in fullText.keys():
                idfDict1[word] = math.log10(N/docCount[word])  

            #compute tf-idf
            tfIdf1 = {}
            for idx in tfData.keys():
                tfIdf1[idx] = {}
                for word in tfData[idx].keys():
                    val = tf1[idx][word] * idfDict1[word]
                    tfIdf1[idx][word] = val
            #tfIdf1[0]

            desc1 = []
            for idx, row in data.iterrows():
                modded = []
                temp = row["description"].split()
                for word in temp:
                    modded.append(tfIdf1[idx][word])
                desc1.append(modded)
            #desc1[0]
            data["description_values_tfIdf"] = desc1
            #get word count
            data["word_count"] = data["description"].apply(lambda x: len(str(x).split(" ")))
            
            try:
                data.drop(["stopwords"],axis=1, inplace=True)
            except: 
                print("no stopwords found (it's ok don't worry)")
            
            if stemming:
                data.to_csv("stemmedData_tf_tfIdf_fullDataset.csv", encoding='utf-8', index=False)
            else: 
                data.to_csv("notStemmedData_tf_tfIdf_fullDataset.csv", encoding='utf-8', index=False)

In [4]:
stemming = True #if True apply stemming 
data = preProcess(stemming)
data

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery,word_count
0,Italy,tropic broom brimston herb express appl citru ...,Vulkà Bianco,87,17.000000,Sicily & Sardinia,Etna,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,10
1,Portugal,fruiti firm juici red berri drinkabl,Avidagos,87,15.000000,Douro,unknown,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,6
2,US,tart snappi lime flesh domin green pineappl po...,unknown,87,14.000000,Oregon,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,10
3,US,pineappl rind lemon pith orang blossom opul no...,Reserve Late Harvest,87,13.000000,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,13
4,US,much regular bottl rough tannic rustic earthi ...,Vintner's Reserve Wild Child Block,87,65.000000,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,16
5,Spain,blackberri typic navarran whiff green herb cas...,Ars In Vitro,87,15.000000,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,17
6,Italy,bright inform candi berri white pepper savori ...,Belsito,87,16.000000,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo,10
7,France,dri restrain offer profus firm textur much food,unknown,87,24.000000,Alsace,Alsace,Roger Voss,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach,8
8,Germany,savori thyme note accent preserv peach brisk o...,Shine,87,12.000000,Rheinhessen,unknown,Anna Lee C. Iijima,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...,Gewürztraminer,Heinz Eifel,12
9,France,great depth fresh appl pear touch spice dri cr...,Les Natures,87,27.000000,Alsace,Alsace,Roger Voss,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam,11


In [11]:
data.iloc[103]

country                                                              Chile
description              bright nose green appl citric mild oak raci co...
designation                                   Single Vineyard Falaris Hill
points                                                                  87
price                                                                   18
province                                                      Leyda Valley
region_1                                                           unknown
taster_name                                              Michael Schachner
title                    Leyda 2015 Single Vineyard Falaris Hill Chardo...
variety                                                         Chardonnay
winery                                                               Leyda
word_count                                                              16
description_values_tf    [0.08838544990427569, 0.09149010072941993, 0.1...
description_tfIdf        

In [6]:
groupByVariety = True #if true apply grouping by variety for the description analysis
computeData(data,groupByVariety,stemming)

1752815
no stopwords found (it's ok don't worry)


In [8]:
a = pd.read_csv("stemmedData_tfIdf_groupedByVariety.csv")
#pd.read_csv("notStemmedData_tfIdf_groupedByVariety.csv")
#a =pd.read_csv("stemmedData_tf_tfIdf_fullDataset.csv")
#pd.read_csv("notStemmedData_tf_tfIdf_fullDataset.csv")


In [10]:
a.iloc[103]

country                                                              Chile
description              bright nose green appl citric mild oak raci co...
designation                                   Single Vineyard Falaris Hill
points                                                                  87
price                                                                   18
province                                                      Leyda Valley
region_1                                                           unknown
taster_name                                              Michael Schachner
title                    Leyda 2015 Single Vineyard Falaris Hill Chardo...
variety                                                         Chardonnay
winery                                                               Leyda
word_count                                                              16
description_values_tf    [0.08838544990427569, 0.09149010072941993, 0.1...
description_tfIdf        

In [None]:
import ast
a.head(2)

In [None]:
#make the string array real array

In [None]:
a['description_values_tf'] = a['description_values_tf'].map(ast.literal_eval)

In [None]:
a['description_values_tfIdf'] = a['description_values_tfIdf'].map(ast.literal_eval)

In [None]:
#add new colum

In [None]:
a['new_1']=0

In [None]:
a['new_2']=0

In [None]:
a['new_3']=0

In [None]:
a.head(2)

In [None]:
#check the max lenght

In [None]:
max_tf = 0
max_tfIdf = 0
for index, row in a.iterrows(): 
       #print(len(row['description_values_tf']))
        len_row_tf = len(row['description_values_tf'])
        len_row_tfIdf = len(row['description_values_tfIdf'])
        
        if (len_row_tf > max_tf):
            max_tf = len_row_tf
            
        if (len_row_tfIdf > max_tfIdf):
            max_tfIdf = len_row_tfIdf
            
        # print(len(row[description_values_tfIdf]))


In [None]:
#take the tre hight value and put in the colum
#

In [None]:
import heapq

for index, row in a.iterrows(): 
    #print(row['description_values_tf'])
    #z = np.partition(row['description_values_tf'], 2)[:3]
    x = heapq.nlargest(3, row['description_values_tf'])#change this value to  description_values_tfIdf to get it
    try:
        a.loc[index,'new_1'] = x[0]
        try:
            a.loc[index,'new_2'] = x[1]
            
            try:
                a.loc[index,'new_3'] = x[2]
                pass
            except IndexError:
                a.loc[index,'new_3'] = 0
                pass
            
        except IndexError:
            a.loc[index,'new_2'] = 0
            a.loc[index,'new_3'] = 0
            pass
    
    except IndexError:
        a.loc[index,'new_1'] = 0
        a.loc[index,'new_2'] = 0
        a.loc[index,'new_3'] = 0
        pass


    


    

In [None]:
a.head(2)

In [None]:
a= a.drop(['description_values_tf','description_values_tfIdf'], axis=1)

In [None]:
a.head(2)

In [None]:
a.to_csv("stemmedData_tf_fullDataset_parsed.csv", encoding='utf-8', index=False)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [None]:
X = a.loc[:,['new_1','new_2','new_3']]

In [None]:
Y=a['points'].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0)

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV

lr = LinearRegression()
lr.fit(X_train, y_train)


In [None]:
y_test_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
rmse

In [None]:
features = ['new_1','new_2','new_3']
