In [None]:
import math
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords

In [None]:
#apply description pre processing

#arguments {stemming = [True, False]}
def _count_words(descriptions):
    """Return a Series mapping each word to its number of occurrences, most frequent first."""
    words = " ".join(descriptions).split()
    # dtype=object keeps an empty corpus from triggering dtype-inference warnings.
    return pd.Series(words, dtype=object).value_counts()


def _drop_words(descriptions, unwanted):
    """Return the descriptions with every word contained in ``unwanted`` removed."""
    unwanted = set(unwanted)  # O(1) membership instead of scanning a list per word
    return descriptions.apply(
        lambda text: " ".join(word for word in text.split() if word not in unwanted)
    )


def preProcess(stemming, top_n=10, min_count=10):
    """Load the review dataset and clean its ``description`` column.

    Steps, in order:
      1. read ``stemmed.csv`` (``stemming`` truthy) or ``dataAfterPOS.csv``;
      2. strip every character that is neither a word character nor whitespace;
      3. remove the ``top_n`` most frequent words of the whole corpus;
      4. remove every word that then occurs fewer than ``min_count`` times;
      5. drop the helper columns ``region_2``, ``word_count`` and ``stopwords``
         when they are present.

    Parameters
    ----------
    stemming : bool
        Selects which input file is read (see step 1).
    top_n : int, default 10
        How many of the most frequent words to discard.
    min_count : int, default 10
        Words occurring fewer times than this are discarded.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the cleaned ``description`` column.
    """
    source = "stemmed.csv" if stemming else "dataAfterPOS.csv"
    data = pd.read_csv(source).drop(columns=["region_2"], errors="ignore")

    # NOTE(review): stop words are NOT removed here. The previous code only
    # counted them into a temporary column that was dropped unread; presumably
    # they are filtered when the input CSVs are produced -- confirm.

    # Strip punctuation. regex=True is required: pandas >= 2.0 treats the
    # pattern as a literal string by default, which silently removes nothing.
    data["description"] = data["description"].str.replace(r"[^\w\s]", "", regex=True)

    # The most frequent words carry no discriminating information.
    most_frequent = _count_words(data["description"]).head(top_n).index
    data["description"] = _drop_words(data["description"], most_frequent)

    # Rare words are selected with a boolean mask. Slicing the tail with
    # [-count:] selects *every* word when count == 0 and would empty the corpus.
    counts = _count_words(data["description"])
    rare = counts[counts < min_count].index
    data["description"] = _drop_words(data["description"], rare)

    return data.drop(columns=["word_count", "stopwords"], errors="ignore")

In [None]:
def _tfidf_by_variety(token_rows, varieties, corpus_counts):
    """Score each token with the TF-IDF computed inside its row's variety.

    For a word ``w`` and a variety ``v``:
      * tf  = occurrences of ``w`` in all descriptions of ``v``
              / occurrences of ``w`` in the whole corpus;
      * idf = log10(number of descriptions of ``v``
              / number of descriptions of ``v`` that contain ``w``).

    Returns one list of scores per row, aligned with the row's tokens.
    """
    term_counts = {}    # variety -> Counter of word occurrences
    doc_counts = {}     # variety -> Counter of descriptions containing the word
    n_docs = Counter()  # variety -> number of descriptions
    for tokens, variety in zip(token_rows, varieties):
        term_counts.setdefault(variety, Counter()).update(tokens)
        # set(): a word repeated inside one description still counts as one document.
        doc_counts.setdefault(variety, Counter()).update(set(tokens))
        n_docs[variety] += 1

    scores = {}
    for variety, counts in term_counts.items():
        total_docs = n_docs[variety]
        containing = doc_counts[variety]
        scores[variety] = {
            word: (count / corpus_counts[word]) * math.log10(total_docs / containing[word])
            for word, count in counts.items()
        }

    return [
        [scores[variety][word] for word in tokens]
        for tokens, variety in zip(token_rows, varieties)
    ]


def _tf_and_tfidf_by_description(token_rows, corpus_counts):
    """Score each token treating every description as its own document.

    For a word ``w`` in description ``d``:
      * tf  = occurrences of ``w`` in ``d`` / occurrences of ``w`` in the corpus;
      * idf = log10(number of descriptions / number of descriptions containing ``w``).

    Returns ``(tf_rows, tfidf_rows)``, each holding one list of scores per row,
    aligned with the row's tokens.
    """
    n_docs = len(token_rows)
    doc_counts = Counter()
    for tokens in token_rows:
        doc_counts.update(set(tokens))
    idf = {word: math.log10(n_docs / seen_in) for word, seen_in in doc_counts.items()}

    tf_rows, tfidf_rows = [], []
    for tokens in token_rows:
        tf = {word: count / corpus_counts[word] for word, count in Counter(tokens).items()}
        tf_rows.append([tf[word] for word in tokens])
        tfidf_rows.append([tf[word] * idf[word] for word in tokens])
    return tf_rows, tfidf_rows


def computeData(data, groupedByVariety, stemming):
    """Attach per-word TF / TF-IDF scores to ``data`` and write the result to CSV.

    ``data`` is modified in place (new columns are added); nothing is returned.
    Rows are expected to hold string descriptions (and, when grouping, a
    ``variety`` value) -- missing values are not handled.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``description`` column, plus ``variety`` when
        ``groupedByVariety`` is truthy.
    groupedByVariety : bool
        Truthy: all descriptions of one variety form a single document. Adds
        ``description_tfIdf``, prints the corpus word total and writes
        ``<prefix>_tfIdf_groupedByVariety.csv``.
        Falsy: every description is its own document. Adds
        ``description_values_tf`` and ``description_values_tfIdf`` and writes
        ``<prefix>_tf_tfIdf_fullDataset.csv``.
    stemming : bool
        Only selects the output prefix: ``stemmedData`` or ``notStemmedData``.
    """
    token_rows = [text.split() for text in data["description"]]
    # Plain-int counts keep every score a built-in float, so the lists stored
    # in the CSV stay readable (NumPy 2 scalars serialize as "np.float64(...)").
    corpus_counts = Counter(word for tokens in token_rows for word in tokens)

    if groupedByVariety:
        # Total number of words in the corpus (summing the per-word counts
        # avoids fusing words across description boundaries).
        print(sum(corpus_counts.values()))

        data["description_tfIdf"] = _tfidf_by_variety(
            token_rows, data["variety"].tolist(), corpus_counts
        )
        if stemming:
            target = "stemmedData_tfIdf_groupedByVariety.csv"
        else:
            target = "notStemmedData_tfIdf_groupedByVariety.csv"
    else:
        # Decided by the parameter itself, never by a notebook-level global.
        tf_rows, tfidf_rows = _tf_and_tfidf_by_description(token_rows, corpus_counts)
        data["description_values_tf"] = tf_rows
        data["description_values_tfIdf"] = tfidf_rows
        if stemming:
            target = "stemmedData_tf_tfIdf_fullDataset.csv"
        else:
            target = "notStemmedData_tf_tfIdf_fullDataset.csv"

    data.to_csv(target, encoding="utf-8", index=False)

In [None]:
# True reads the stemmed dataset ("stemmed.csv"); False reads "dataAfterPOS.csv".
stemming = True
data = preProcess(stemming)
# Preview only -- avoids dumping the whole frame into the notebook output.
data.head()

In [None]:
# Toggle for the description analysis: a truthy value groups the
# descriptions by variety before scoring them.
groupByVariety = True
computeData(data, groupedByVariety=groupByVariety, stemming=stemming)

In [None]:
# Uncomment one of the lines below to inspect the matching output file written by computeData.
#pd.read_csv("stemmedData_tfIdf_groupedByVariety.csv")
#pd.read_csv("notStemmedData_tfIdf_groupedByVariety.csv")
#pd.read_csv("stemmedData_tf_tfIdf_fullDataset.csv")
#pd.read_csv("notStemmedData_tf_tfIdf_fullDataset.csv")