In [1]:
import math
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords

In [2]:
#apply description pre processing

#arguments {stemming = [True, False]}
def preProcess(stemming, topN=10, minCount=10):
    """Load the review table and prune the vocabulary of every description.

    Steps:
      1. read "stemmed.csv" (stemming truthy) or "dataAfterPOS.csv" (otherwise);
      2. drop the "region_2" column;
      3. strip every character that is neither a word character nor whitespace;
      4. remove the `topN` most frequent words and every word that occurs
         fewer than `minCount` times over the whole corpus;
      5. drop the helper columns "word_count" / "stopwords" when present.

    Args:
        stemming: selects which input file is read (see step 1).
        topN: how many of the most frequent words are discarded.
        minCount: words occurring fewer times than this are discarded.

    Returns:
        pandas.DataFrame with the cleaned "description" column.

    Raises:
        FileNotFoundError: if the selected CSV file does not exist.
        KeyError: if the file has no "region_2" or "description" column.
    """
    source = "stemmed.csv" if stemming else "dataAfterPOS.csv"
    data = pd.read_csv(source).drop(columns=["region_2"])

    # No stop-word pass happens here: the old code only counted stop words
    # into a column that was dropped again before returning.
    # NOTE(review): the input files are presumably already stop-word-free -
    # confirm against the notebook that produces them.

    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default and would strip nothing.
    descriptions = (
        data["description"]
        .fillna("")  # a missing description is treated as an empty one
        .astype(str)
        .str.replace(r"[^\w\s]", "", regex=True)
    )

    # Removing the frequent words does not change the counts of the others,
    # so one corpus-wide count serves both filters.
    wordCounts = pd.Series(" ".join(descriptions).split()).value_counts()
    tooFrequent = wordCounts.index[:topN]
    # Boolean mask instead of slicing the tail by a counter: `series[-0:]`
    # is the WHOLE series, which used to erase every word when no rare word
    # existed.
    tooRare = wordCounts[wordCounts < minCount].index
    discarded = set(tooFrequent).union(tooRare)

    data["description"] = descriptions.apply(
        lambda text: " ".join(word for word in text.split() if word not in discarded)
    )
    return data.drop(columns=["word_count", "stopwords"], errors="ignore")

In [3]:
def _scoreByVariety(tokens, varieties, corpusCounts):
    """Score every token using its variety's merged descriptions as the document.

    tf  = occurrences of the word in the variety / occurrences in the corpus
    idf = log10(descriptions in the variety / descriptions of the variety
          that contain the word)

    Returns two lists of lists (tf, tf * idf) aligned with `tokens`.
    """
    docsPerVariety = Counter(varieties)
    occurrences = {}   # variety -> Counter(word -> occurrences)
    docFrequency = {}  # variety -> Counter(word -> descriptions containing it)
    for words, variety in zip(tokens, varieties):
        occurrences.setdefault(variety, Counter()).update(words)
        # set(): a word repeated inside one description still counts as ONE
        # document, matching how the full-dataset mode counts documents.
        docFrequency.setdefault(variety, Counter()).update(set(words))

    tfColumn, tfIdfColumn = [], []
    for words, variety in zip(tokens, varieties):
        tfRow, tfIdfRow = [], []
        for word in words:
            tf = occurrences[variety][word] / corpusCounts[word]
            idf = math.log10(docsPerVariety[variety] / docFrequency[variety][word])
            tfRow.append(tf)
            tfIdfRow.append(tf * idf)
        tfColumn.append(tfRow)
        tfIdfColumn.append(tfIdfRow)
    return tfColumn, tfIdfColumn


def _scorePerDescription(tokens, corpusCounts):
    """Score every token using each single description as the document.

    tf  = occurrences of the word in the description / occurrences in the corpus
    idf = log10(number of descriptions / descriptions that contain the word)

    Returns two lists of lists (tf, tf * idf) aligned with `tokens`.
    """
    docFrequency = Counter(word for words in tokens for word in set(words))
    idf = {word: math.log10(len(tokens) / seen) for word, seen in docFrequency.items()}

    tfColumn, tfIdfColumn = [], []
    for words in tokens:
        localCounts = Counter(words)
        tfRow = [localCounts[word] / corpusCounts[word] for word in words]
        tfColumn.append(tfRow)
        tfIdfColumn.append([tf * idf[word] for tf, word in zip(tfRow, words)])
    return tfColumn, tfIdfColumn


def computeData(data, groupedByVariety, stemming):
    """Attach per-word tf and tf-idf score lists to `data` and save it as CSV.

    Note that "tf" here is the notebook's own ratio (occurrences in the
    document divided by occurrences in the whole corpus), not the textbook
    term frequency.

    Args:
        data: DataFrame with a whitespace-tokenisable "description" column
            and, in grouped mode, a "variety" column. It is modified in place.
        groupedByVariety: if truthy, the document of a word is the merged text
            of its variety; the scores go to "description_values_tf" and
            "description_tfIdf". Otherwise each description is its own
            document and the scores go to "description_values_tf" and
            "description_values_tfIdf".
        stemming: only selects the "stemmedData" / "notStemmedData" prefix of
            the output file name.

    Returns:
        None. The result is the mutated `data` plus the CSV written to the
        current working directory.

    The mode is chosen from the `groupedByVariety` argument alone; it no
    longer consults a notebook-level variable. Scores are stored as plain
    Python floats so the lists written to CSV can be read back with
    ast.literal_eval regardless of how numpy prints its scalars.
    """
    tokens = [text.split() for text in data["description"]]
    corpusCounts = Counter(word for words in tokens for word in words)

    if groupedByVariety:
        # total number of words in the corpus
        print(sum(corpusCounts.values()))
        tfColumn, tfIdfColumn = _scoreByVariety(
            tokens, data["variety"].tolist(), corpusCounts
        )
        tfIdfName = "description_tfIdf"
        if stemming:
            target = "stemmedData_tfIdf_groupedByVariety.csv"
        else:
            target = "notStemmedData_tfIdf_groupedByVariety.csv"
    else:
        tfColumn, tfIdfColumn = _scorePerDescription(tokens, corpusCounts)
        tfIdfName = "description_values_tfIdf"
        if stemming:
            target = "stemmedData_tf_tfIdf_fullDataset.csv"
        else:
            target = "notStemmedData_tf_tfIdf_fullDataset.csv"

    data["description_values_tf"] = tfColumn
    data[tfIdfName] = tfIdfColumn
    data.to_csv(target, encoding='utf-8', index=False)

In [4]:
# True reads the stemmed descriptions, False the unstemmed ones.
stemming = True
data = preProcess(stemming)
# Preview only: a bare `data` would try to render the whole table.
data.head(10)

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery
0,Italy,tropic broom brimston herb express appl citru ...,Vulkà Bianco,87,17.000000,Sicily & Sardinia,Etna,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,fruiti firm juici red berri drinkabl,Avidagos,87,15.000000,Douro,unknown,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,tart snappi lime flesh domin green pineappl po...,unknown,87,14.000000,Oregon,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,pineappl rind lemon pith orang blossom opul no...,Reserve Late Harvest,87,13.000000,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,much regular bottl rough tannic rustic earthi ...,Vintner's Reserve Wild Child Block,87,65.000000,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,Spain,blackberri typic navarran whiff green herb cas...,Ars In Vitro,87,15.000000,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
6,Italy,bright inform candi berri white pepper savori ...,Belsito,87,16.000000,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
7,France,dri restrain offer profus firm textur much food,unknown,87,24.000000,Alsace,Alsace,Roger Voss,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach
8,Germany,savori thyme note accent preserv peach brisk o...,Shine,87,12.000000,Rheinhessen,unknown,Anna Lee C. Iijima,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...,Gewürztraminer,Heinz Eifel
9,France,great depth fresh appl pear touch spice dri cr...,Les Natures,87,27.000000,Alsace,Alsace,Roger Voss,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam


In [7]:
# True scores words against their variety's merged text,
# False scores every description on its own.
groupByVariety = False
computeData(data, groupedByVariety=groupByVariety, stemming=stemming)

In [8]:
# Load the scores written by computeData for the stemmed, full-dataset run.
# The other possible outputs are:
#   stemmedData_tfIdf_groupedByVariety.csv
#   notStemmedData_tfIdf_groupedByVariety.csv
#   notStemmedData_tf_tfIdf_fullDataset.csv
a = pd.read_csv(filepath_or_buffer="stemmedData_tf_tfIdf_fullDataset.csv")

In [9]:
import ast
# First two rows of the loaded table.
a.iloc[:2]

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery,description_values_tf,description_tfIdf,description_values_tfIdf
0,Italy,tropic broom brimston herb express appl citru ...,Vulkà Bianco,87,17.0,Sicily & Sardinia,Etna,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,"[0.000304228780042592, 0.0058823529411764705, ...","[0.04885833490039738, 0.3057715203598046, 0.38...","[0.0004757190717126231, 0.016706767017015508, ..."
1,Portugal,fruiti firm juici red berri drinkabl,Avidagos,87,15.0,Douro,unknown,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,"[9.255831173639393e-05, 0.00011741223435481978...","[0.03228609409338993, 0.03883847038764645, 0.0...","[9.783838242588247e-05, 0.00013509834598890726..."


In [None]:
# Turn the list columns, which were read back as strings, into real lists

In [10]:
# read_csv returns the score lists as text. Only parse values that are still
# strings, so re-running this cell does not fail on already parsed lists.
a['description_values_tf'] = a['description_values_tf'].map(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [11]:
# read_csv returns the score lists as text. Only parse values that are still
# strings, so re-running this cell does not fail on already parsed lists.
a['description_values_tfIdf'] = a['description_values_tfIdf'].map(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# Add the new columns

In [12]:
# Float placeholder: the column is later filled with float scores, and
# writing floats into an int column forces a dtype change per assignment.
a['new_1'] = 0.0

In [13]:
# Float placeholder: the column is later filled with float scores, and
# writing floats into an int column forces a dtype change per assignment.
a['new_2'] = 0.0

In [14]:
# Float placeholder: the column is later filled with float scores, and
# writing floats into an int column forces a dtype change per assignment.
a['new_3'] = 0.0

In [15]:
# Preview only: a bare `a` would try to render the whole table.
a.head(10)

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery,description_values_tf,description_tfIdf,description_values_tfIdf,new_1,new_2,new_3
0,Italy,tropic broom brimston herb express appl citru ...,Vulkà Bianco,87,17.000000,Sicily & Sardinia,Etna,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,"[0.000304228780042592, 0.0058823529411764705, ...","[0.04885833490039738, 0.3057715203598046, 0.38...","[0.0004757190717126231, 0.016706767017015508, ...",0,0,0
1,Portugal,fruiti firm juici red berri drinkabl,Avidagos,87,15.000000,Douro,unknown,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,"[9.255831173639393e-05, 0.00011741223435481978...","[0.03228609409338993, 0.03883847038764645, 0.0...","[9.783838242588247e-05, 0.00013509834598890726...",0,0,0
2,US,tart snappi lime flesh domin green pineappl po...,unknown,87,14.000000,Oregon,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,"[0.00019542700801250732, 0.0017889087656529517...","[0.019578555849338562, 0.03787011890072908, 0....","[0.00026771459783581997, 0.004155970494426631,...",0,0,0
3,US,pineappl rind lemon pith orang blossom opul no...,Reserve Late Harvest,87,13.000000,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,"[0.00026616981634282674, 0.001122334455667789,...","[0.09546425350974236, 0.20805243488881586, 0.0...","[0.0003992075979441475, 0.0023901067009712016,...",0,0,0
4,US,much regular bottl rough tannic rustic earthi ...,Vintner's Reserve Wild Child Block,87,65.000000,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,"[0.0005330490405117271, 0.0029850746268656717,...","[0.1711900090314572, 0.3208454187539189, 0.232...","[0.000961190242813436, 0.0076181652966955905, ...",0,0,0
5,Spain,blackberri typic navarran whiff green herb cas...,Ars In Vitro,87,15.000000,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,"[7.586102260658473e-05, 0.0011467889908256881,...","[6.411000151830199e-05, 0.0009691491284567166,...","[7.347467165041128e-05, 0.0024450454003376357,...",0,0,0
6,Italy,bright inform candi berri white pepper savori ...,Belsito,87,16.000000,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo,"[0.00010636034886194426, 0.002890173410404624,...","[0.0006457325944628814, 0.007391639311866063, ...","[0.00011788180277043133, 0.007320174043923803,...",0,0,0
7,France,dri restrain offer profus firm textur much food,unknown,87,24.000000,Alsace,Alsace,Roger Voss,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach,"[5.8513750731421886e-05, 0.003484320557491289,...","[0.00816977484536169, 0.06847392686501436, 0.0...","[5.018527340918058e-05, 0.009103545008839435, ...",0,0,0
8,Germany,savori thyme note accent preserv peach brisk o...,Shine,87,12.000000,Rheinhessen,unknown,Anna Lee C. Iijima,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...,Gewürztraminer,Heinz Eifel,"[0.00021263023601956197, 0.0010976948408342481...","[0.006270635180525164, 0.0032548866092490802, ...","[0.00029989328362539403, 0.0023236439325331304...",0,0,0
9,France,great depth fresh appl pear touch spice dri cr...,Les Natures,87,27.000000,Alsace,Alsace,Roger Voss,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam,"[0.00021635655560363478, 0.0004355400696864111...","[0.015969532708426708, 0.02142619849953519, 0....","[0.00030798569570626466, 0.0007452714695774928...",0,0,0


In [None]:
# Check the maximum length of the score lists

In [None]:
# Length of the longest score list in each column (0 when the table is empty).
max_tf = max(map(len, a['description_values_tf']), default=0)
max_tfIdf = max(map(len, a['description_values_tfIdf']), default=0)


In [None]:
# Take the three highest values and put them in the new columns

In [16]:
import heapq

# For every description keep its three largest tf scores, padded with 0 when
# the description has fewer than three words.
top3 = [
    (heapq.nlargest(3, scores) + [0, 0, 0])[:3]
    for scores in a['description_values_tf']
]

# Assign whole columns at once: this avoids one .loc write per cell and never
# writes a float into an existing integer column.
for position, column in enumerate(['new_1', 'new_2', 'new_3']):
    a[column] = [scores[position] for scores in top3]


    


    

In [44]:
# Persist the table including the three new_* columns.
a.to_csv(
    "stemmedData_tf_fullDataset_parsed.csv",
    index=False,
    encoding='utf-8',
)


In [43]:
# First two rows, now with the new_* columns filled in.
a.iloc[:2]

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery,description_values_tf,description_tfIdf,description_values_tfIdf,new_1,new_2,new_3
0,Italy,tropic broom brimston herb express appl citru ...,Vulkà Bianco,87,17.0,Sicily & Sardinia,Etna,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,"[0.000304228780042592, 0.0058823529411764705, ...","[0.04885833490039738, 0.3057715203598046, 0.38...","[0.0004757190717126231, 0.016706767017015508, ...",0.018182,0.005882,0.00064
1,Portugal,fruiti firm juici red berri drinkabl,Avidagos,87,15.0,Douro,unknown,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,"[9.255831173639393e-05, 0.00011741223435481978...","[0.03228609409338993, 0.03883847038764645, 0.0...","[9.783838242588247e-05, 0.00013509834598890726...",0.001109,0.000117,0.000113


In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [23]:
# Feature matrix: the three largest tf scores of every description.
X = a[['new_1', 'new_2', 'new_3']]

In [24]:
# Regression target: the "points" column as a NumPy array.
Y = a['points'].to_numpy()

In [25]:
# Hold out a quarter of the rows (train_test_split's default share) for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0
)

In [26]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV

# fit() returns the estimator itself, so it can be bound and shown in one go.
lr = LinearRegression().fit(X_train, y_train)
lr


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [27]:
# Root-mean-squared error of the fitted model on the held-out rows.
y_test_pred = lr.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
rmse = np.sqrt(mse)
rmse

3.0654640503416006

In [32]:
# Names of the feature columns: new_1, new_2, new_3.
features = ['new_' + str(rank) for rank in (1, 2, 3)]


In [41]:
# Import basic libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# import visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
from ggplot import *
%matplotlib inline
from sklearn.model_selection import KFold
df=a
classifier=lr

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp


ImportError: cannot import name 'Timestamp'

In [45]:
from sklearn.base import clone

# 3-fold cross-validation of the linear model on the new_* features.
kf = KFold(n_splits=3,random_state=42,shuffle=True)

fold = []
scr = []

# NOTE(review): the target here is 'price', while the train/test split above
# predicts 'points' - confirm which target is intended.
for i, (train_index, test_index) in enumerate(kf.split(df), start=1):
    training = df.iloc[train_index,:]
    valid = df.iloc[test_index,:]
    # Fit an unfitted copy per fold: refitting `lr` itself would silently
    # overwrite the model that was evaluated in the RMSE cell.
    fold_model = clone(lr).fit(training[features], training['price'])
    pred = fold_model.predict(valid[features])
    score = mean_squared_error(y_true = valid['price'], y_pred = pred)
    fold.append(i)
    scr.append(score)

# Small table with the mean squared error of every fold.
performance = pd.DataFrame({'Score':scr,'Fold':fold})
#g = ggplot(performance,aes(x='Fold',y='Score')) + geom_point() + geom_line()
#print(g)
performance


KeyboardInterrupt: 