# To make the project dataframe in .csv from a .txt file

In [1]:
import csv
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from itertools import izip, tee
import datetime

### Convert to .cvs file

In [2]:
txt_file = "./beeradvocate.txt"
csv_file = "./beeradvocate_2.csv"

in_txt = csv.reader(open(txt_file, "rb"), delimiter = (':'))
out_csv = csv.writer(open(csv_file, 'wb'))

out_csv.writerows(in_txt)

### Already having a .csv file, split the table at ' : ' 
### we use %capture to avoid skipping errors at the output

In [3]:
%%capture capt 
beer = pd.read_csv ("beeradvocate_2.csv", sep =":", error_bad_lines=False, header = None, )

In [4]:
beer.head (20)


Unnamed: 0,0
0,"beer/name, Sausa Weizen"
1,"beer/beerId, 47986"
2,"beer/brewerId, 10325"
3,"beer/ABV, 5.00"
4,"beer/style, Hefeweizen"
5,"review/appearance, 2.5"
6,"review/aroma, 2"
7,"review/palate, 1.5"
8,"review/taste, 1.5"
9,"review/overall, 1.5"


### Split at ' , ' to get the values of columns and rows

In [5]:
beer['columnas'], beer ['filas'] = beer [0].str.split(',', 1).str

In [6]:
beer.head()

Unnamed: 0,0,columnas,filas
0,"beer/name, Sausa Weizen",beer/name,Sausa Weizen
1,"beer/beerId, 47986",beer/beerId,47986
2,"beer/brewerId, 10325",beer/brewerId,10325
3,"beer/ABV, 5.00",beer/ABV,5.00
4,"beer/style, Hefeweizen",beer/style,Hefeweizen


### Make each column to join after:
- First try with beer/name and beer/Id
- Repeat with every column
- Join to dataframe

In [7]:
name = beer[beer['columnas'] == 'beer/name']

In [8]:
name.head()

Unnamed: 0,0,columnas,filas
0,"beer/name, Sausa Weizen",beer/name,Sausa Weizen
13,"beer/name, Red Moon",beer/name,Red Moon
26,"beer/name, Black Horse Black Beer",beer/name,Black Horse Black Beer
39,"beer/name, Sausa Pils",beer/name,Sausa Pils
52,"beer/name, Cauldron DIPA",beer/name,Cauldron DIPA


### Change the column name to join after

In [9]:
name.rename(columns ={
    'filas':'names'
}
           ,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [10]:
names=name['names']

In [11]:
names.head()

0                Sausa Weizen
13                   Red Moon
26     Black Horse Black Beer
39                 Sausa Pils
52              Cauldron DIPA
Name: names, dtype: object

### The object has the index of the original dataset. It must be changed to the same sequence for every column. To make an index column and change the index for the column 

In [12]:
names.shape

(1586187,)

In [13]:
col_1 = pd.Series (range(0,1586187))

In [14]:
col_1.tail()

1586182    1586182
1586183    1586183
1586184    1586184
1586185    1586185
1586186    1586186
dtype: int64

In [15]:
names.index = col_1

In [16]:
names.head()

0               Sausa Weizen
1                   Red Moon
2     Black Horse Black Beer
3                 Sausa Pils
4              Cauldron DIPA
Name: names, dtype: object

In [17]:
names.tail()

1586182     The Horseman's Ale
1586183     The Horseman's Ale
1586184     The Horseman's Ale
1586185     The Horseman's Ale
1586186     The Horseman's Ale
Name: names, dtype: object

### Same for beer/Id

In [18]:
beer_ID = beer[beer['columnas'] == 'beer/beerId']
beer_ID.rename(columns={
    'filas' : 'Id'
}
              ,inplace = True)
Id=beer_ID['Id']
Id.index = col_1

### Check the shapes

In [19]:
col_1.shape

(1586187,)

In [20]:
names.shape

(1586187,)

In [21]:
Id.shape

(1586187,)

### Same for the rest

In [22]:
brewer_ID = beer[beer['columnas'] == 'beer/brewerId']
brewer_ID.rename(columns={
    'filas' : 'brewerId'
}
              ,inplace = True)
brewerId=brewer_ID['brewerId']
brewerId.index = col_1

In [23]:
ABV = beer[beer['columnas'] == 'beer/ABV']
ABV.rename(columns={
    'filas' : 'ABV'
}
              ,inplace = True)
ABV=ABV['ABV']
ABV.index = col_1

In [24]:
style = beer[beer['columnas'] == 'beer/style']
style.rename(columns={
    'filas' : 'style'
}
              ,inplace = True)
style=style['style']
style.index = col_1

In [25]:
appearance = beer[beer['columnas'] == 'review/appearance']
appearance.rename(columns={
    'filas' : 'appearance'
}
              ,inplace = True)
appearance=appearance['appearance']
appearance.index = col_1

In [26]:
aroma = beer[beer['columnas'] == 'review/aroma']
aroma.rename(columns={
    'filas' : 'aroma'
}
              ,inplace = True)
aroma=aroma['aroma']
aroma.index = col_1

In [27]:
palate = beer[beer['columnas'] == 'review/palate']
palate.rename(columns={
    'filas' : 'palate'
}
              ,inplace = True)
palate=palate['palate']
palate.index = col_1

In [28]:
taste = beer[beer['columnas'] == 'review/taste']
taste.rename(columns={
    'filas' : 'taste'
}
              ,inplace = True)
taste=taste['taste']
taste.index = col_1

In [29]:
overall = beer[beer['columnas'] == 'review/overall']
overall.rename(columns={
    'filas' : 'overall'
}
              ,inplace = True)
overall=overall['overall']
overall.index = col_1

In [30]:
time = beer[beer['columnas'] == 'review/time']
time.rename(columns={
    'filas' : 'time'
}
              ,inplace = True)
time=time['time']
time.index = col_1

In [31]:
profile_name = beer[beer['columnas'] == 'review/profileName']
profile_name.rename(columns={
    'filas' : 'profile_name'
}
              ,inplace = True)
profile_name=profile_name['profile_name']
profile_name.index = col_1

### Now join the columns to get the dataframe

In [32]:
df_beer = pd.concat([names,
                     Id,
                     brewerId,
                     ABV,
                     style,
                    appearance,
                     aroma,
                     palate,
                     taste,
                     overall,
                     time,                     
                     profile_name,
                     
                    ], axis=1, join = 'outer')

In [33]:
df_beer.head()

Unnamed: 0,names,Id,brewerId,ABV,style,appearance,aroma,palate,taste,overall,time,profile_name
0,Sausa Weizen,47986,10325,5.0,Hefeweizen,2.5,2.0,1.5,1.5,1.5,1234817823,stcules
1,Red Moon,48213,10325,6.2,English Strong Ale,3.0,2.5,3.0,3.0,3.0,1235915097,stcules
2,Black Horse Black Beer,48215,10325,6.5,Foreign / Export Stout,3.0,2.5,3.0,3.0,3.0,1235916604,stcules
3,Sausa Pils,47969,10325,5.0,German Pilsener,3.5,3.0,2.5,3.0,3.0,1234725145,stcules
4,Cauldron DIPA,64883,1075,7.7,American Double / Imperial IPA,4.0,4.5,4.0,4.5,4.0,1293735206,johnmichaelsen


In [34]:
df_beer.tail()

Unnamed: 0,names,Id,brewerId,ABV,style,appearance,aroma,palate,taste,overall,time,profile_name
1586182,The Horseman's Ale,33061,14359,5.2,Pumpkin Ale,4.0,4.5,5.0,5.0,5.0,1162871808,blitheringidiot
1586183,The Horseman's Ale,33061,14359,5.2,Pumpkin Ale,4.5,5.0,5.0,4.5,5.0,1162865640,PopeDX
1586184,The Horseman's Ale,33061,14359,5.2,Pumpkin Ale,4.0,4.0,3.5,3.0,3.5,1162685856,treehugger02010
1586185,The Horseman's Ale,33061,14359,5.2,Pumpkin Ale,3.5,4.0,4.0,4.0,5.0,1162684892,maddogruss
1586186,The Horseman's Ale,33061,14359,5.2,Pumpkin Ale,2.5,5.0,2.0,4.0,4.0,1161048566,yelterdow


In [35]:
df_beer.shape

(1586187, 12)

### 'Text' column does not fit because it has less rows:
- Create the text.csv file
- Make a function to get differences between indexes
- Get the rows which they do not have difference = 13
- Eliminate the rows in the whole dataframe

In [36]:
text = beer[beer['columnas'] == 'review/text']

In [37]:
text.to_csv ('text.csv')

In [38]:
text = pd.read_csv ("./text.csv", sep = ',')

In [39]:
text.head()

Unnamed: 0.1,Unnamed: 0,0,columnas,filas
0,12,"review/text,"" A lot of foam. But a lot.\tIn th...",review/text,""" A lot of foam. But a lot.\tIn the smell some..."
1,25,"review/text,"" Dark red color, light beige foam...",review/text,""" Dark red color, light beige foam, average.\t..."
2,38,"review/text,"" Almost totally black. Beige foam...",review/text,""" Almost totally black. Beige foam, quite comp..."
3,51,"review/text,"" Golden yellow color. White, comp...",review/text,""" Golden yellow color. White, compact foam, qu..."
4,64,"review/text,"" According to the website, the st...",review/text,""" According to the website, the style for the ..."


In [40]:
text.shape

(1586175, 4)

In [41]:
text.rename (columns = {
    'Unnamed: 0' : 'index'
},
            inplace = True)

In [42]:
text.head()

Unnamed: 0,index,0,columnas,filas
0,12,"review/text,"" A lot of foam. But a lot.\tIn th...",review/text,""" A lot of foam. But a lot.\tIn the smell some..."
1,25,"review/text,"" Dark red color, light beige foam...",review/text,""" Dark red color, light beige foam, average.\t..."
2,38,"review/text,"" Almost totally black. Beige foam...",review/text,""" Almost totally black. Beige foam, quite comp..."
3,51,"review/text,"" Golden yellow color. White, comp...",review/text,""" Golden yellow color. White, compact foam, qu..."
4,64,"review/text,"" According to the website, the st...",review/text,""" According to the website, the style for the ..."


In [43]:
index = text['index']

In [44]:
index.head()

0    12
1    25
2    38
3    51
4    64
Name: index, dtype: int64

In [45]:
index.tolist()[:10]

[12, 25, 38, 51, 64, 77, 90, 103, 116, 129]

### Function

In [46]:
def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return izip(a, b)

In [47]:
diff = [y-x for x,y in pairwise(index)]
diff[:10]

[13, 13, 13, 13, 13, 13, 13, 13, 13, 13]

### Differences at wrong rows

In [48]:
wrong_rows = []
for i in diff:
    if i != 13:
        wrong_rows.append (i)
print wrong_rows

[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25]


### Make a dataframe

In [49]:
df_diff = pd.DataFrame (np.array(diff))
df_diff = df_diff.astype (int)

In [50]:
df_diff.head()

Unnamed: 0,0
0,13
1,13
2,13
3,13
4,13


In [51]:
df_index = pd.DataFrame (np.array(index))

In [52]:
df_index.head()

Unnamed: 0,0
0,12
1,25
2,38
3,51
4,64


In [53]:
df_miss = pd.concat ([
    df_index,
    df_diff
    
],
axis=1, join = 'outer')

In [54]:
df_miss.columns = ['index', 'diff']

In [55]:
df_miss.head()

Unnamed: 0,index,diff
0,12,13.0
1,25,13.0
2,38,13.0
3,51,13.0
4,64,13.0


### Get the indexes in which difference !=13 from the original dataset

In [56]:
df_miss [df_miss['diff'] != 13.0]

Unnamed: 0,index,diff
124969,1624609,25.0
246054,3198726,25.0
415676,5403824,25.0
416375,5412923,25.0
775388,10080104,25.0
1097093,14262281,25.0
1177000,15301084,25.0
1246163,16200215,25.0
1246336,16202476,25.0
1378825,17924845,25.0


### Check the rows to fit

In [57]:
text = beer[beer['columnas'] == 'review/text']
text.rename(columns={
    'filas' : 'text'
}
              ,inplace = True)
text= text['text']

In [58]:
text.head()

12    " A lot of foam. But a lot.\tIn the smell some...
25    " Dark red color, light beige foam, average.\t...
38    " Almost totally black. Beige foam, quite comp...
51    " Golden yellow color. White, compact foam, qu...
64    " According to the website, the style for the ...
Name: text, dtype: object

In [59]:
text[124969:124974]

1624609    " Chesnut brown with an even layer of head. La...
1624634    " Rusty amber with caramel brown highlights. S...
1624647    " Pours solid brown and clear with a lace form...
1624660    " 330ml slim brown bottle dated best-before 1/...
1624673    " Served from a 11.2 oz bottle into a tulip gl...
Name: text, dtype: object

In [60]:
text [246054: 246060]

3198726    " Growler fill from the always fantastic Andy'...
3198751    " Beer pours a dark, dark brown color with a m...
3198764    " 1/30/2005\t$1.25\t12 oz.\t9.50%\tBest by May...
3198777    " I must say, I enjoyed this a bit more than m...
3198790    " The beer poured into the glass blackish brow...
3198803    " Whoa! Grabbed this &lt;winter wheat&gt; to e...
Name: text, dtype: object

In [61]:
df_beer[124969:124972]

Unnamed: 0,names,Id,brewerId,ABV,style,appearance,aroma,palate,taste,overall,time,profile_name
124969,Vichtenaar,10643,641,5.1,Flanders Red Ale,4.5,3.5,3,3.5,3.5,1176421694,malcontent
124970,Vichtenaar,10643,641,5.1,Flanders Red Ale,4.0,4.0,4,4.0,4.5,1176355712,msubulldog25
124971,Vichtenaar,10643,641,5.1,Flanders Red Ale,4.0,4.5,4,4.5,4.5,1164939613,alexgash


In [62]:
df_beer[246054:246057]

Unnamed: 0,names,Id,brewerId,ABV,style,appearance,aroma,palate,taste,overall,time,profile_name
246054,Ramstein Winter Wheat,2743,607,9.5,Weizenbock,4.5,4.5,4.5,4.5,4.5,1165784289,jwc215
246055,Ramstein Winter Wheat,2743,607,9.5,Weizenbock,4.5,4.5,3.5,4.0,4.0,1164946415,yelterdow
246056,Ramstein Winter Wheat,2743,607,9.5,Weizenbock,3.5,4.5,4.0,4.5,4.5,1159298456,msubulldog25


### Drop the rows

In [63]:
df_beer=df_beer.drop(df_beer.index[[124970,246055,
                                    415677,416376,
                                    775389,1097094,
                                    1177001,1246164,
                                    1246337,1378826,
                                    1386778,1547383]])

In [64]:
df_beer.shape

(1586175, 12)

In [65]:
df_beer[124969:124972]

Unnamed: 0,names,Id,brewerId,ABV,style,appearance,aroma,palate,taste,overall,time,profile_name
124969,Vichtenaar,10643,641,5.1,Flanders Red Ale,4.5,3.5,3.0,3.5,3.5,1176421694,malcontent
124971,Vichtenaar,10643,641,5.1,Flanders Red Ale,4.0,4.5,4.0,4.5,4.5,1164939613,alexgash
124972,Vichtenaar,10643,641,5.1,Flanders Red Ale,4.5,5.0,4.5,5.0,5.0,1164498691,nortmand


### New index lengh

In [66]:
col_2 = pd.Series (range(0,1586175))
df_beer.index = col_2

In [67]:
df_beer[124969:124972]

Unnamed: 0,names,Id,brewerId,ABV,style,appearance,aroma,palate,taste,overall,time,profile_name
124969,Vichtenaar,10643,641,5.1,Flanders Red Ale,4.5,3.5,3.0,3.5,3.5,1176421694,malcontent
124970,Vichtenaar,10643,641,5.1,Flanders Red Ale,4.0,4.5,4.0,4.5,4.5,1164939613,alexgash
124971,Vichtenaar,10643,641,5.1,Flanders Red Ale,4.5,5.0,4.5,5.0,5.0,1164498691,nortmand


In [68]:
text.index = col_2

In [69]:
text.tail()

1586170    " Never has a beer struck me in such a way tha...
1586171    " What a fantastic pumpkin beer! Probably my m...
1586172    " My buddy got a growler from the Defiant Brew...
1586173    " If you're looking for a quality beer right a...
1586174    " Growler fill from the long-awaited Defiant B...
Name: text, dtype: object

In [70]:
text.head()

0    " A lot of foam. But a lot.\tIn the smell some...
1    " Dark red color, light beige foam, average.\t...
2    " Almost totally black. Beige foam, quite comp...
3    " Golden yellow color. White, compact foam, qu...
4    " According to the website, the style for the ...
Name: text, dtype: object

### Adding 'text' column 

In [71]:
df_beer['text'] = text

In [72]:
df_beer.tail()

Unnamed: 0,names,Id,brewerId,ABV,style,appearance,aroma,palate,taste,overall,time,profile_name,text
1586170,The Horseman's Ale,33061,14359,5.2,Pumpkin Ale,4.0,4.5,5.0,5.0,5.0,1162871808,blitheringidiot,""" Never has a beer struck me in such a way tha..."
1586171,The Horseman's Ale,33061,14359,5.2,Pumpkin Ale,4.5,5.0,5.0,4.5,5.0,1162865640,PopeDX,""" What a fantastic pumpkin beer! Probably my m..."
1586172,The Horseman's Ale,33061,14359,5.2,Pumpkin Ale,4.0,4.0,3.5,3.0,3.5,1162685856,treehugger02010,""" My buddy got a growler from the Defiant Brew..."
1586173,The Horseman's Ale,33061,14359,5.2,Pumpkin Ale,3.5,4.0,4.0,4.0,5.0,1162684892,maddogruss,""" If you're looking for a quality beer right a..."
1586174,The Horseman's Ale,33061,14359,5.2,Pumpkin Ale,2.5,5.0,2.0,4.0,4.0,1161048566,yelterdow,""" Growler fill from the long-awaited Defiant B..."


In [73]:
df_beer.head()

Unnamed: 0,names,Id,brewerId,ABV,style,appearance,aroma,palate,taste,overall,time,profile_name,text
0,Sausa Weizen,47986,10325,5.0,Hefeweizen,2.5,2.0,1.5,1.5,1.5,1234817823,stcules,""" A lot of foam. But a lot.\tIn the smell some..."
1,Red Moon,48213,10325,6.2,English Strong Ale,3.0,2.5,3.0,3.0,3.0,1235915097,stcules,""" Dark red color, light beige foam, average.\t..."
2,Black Horse Black Beer,48215,10325,6.5,Foreign / Export Stout,3.0,2.5,3.0,3.0,3.0,1235916604,stcules,""" Almost totally black. Beige foam, quite comp..."
3,Sausa Pils,47969,10325,5.0,German Pilsener,3.5,3.0,2.5,3.0,3.0,1234725145,stcules,""" Golden yellow color. White, compact foam, qu..."
4,Cauldron DIPA,64883,1075,7.7,American Double / Imperial IPA,4.0,4.5,4.0,4.5,4.0,1293735206,johnmichaelsen,""" According to the website, the style for the ..."


In [74]:
df_beer.to_csv('df_before_dates.csv')

### Change the time format to get dates

In [75]:
beer = pd.read_csv('df_beers.csv')

In [76]:
beer['time'].head(2)

0    1234817823
1    1235915097
Name: time, dtype: int64

In [77]:
def convert_stamp_to_date(stamp):
    try:
        d = datetime.datetime.utcfromtimestamp(stamp)
    except:
        d = datetime.datetime.utcfromtimestamp(0)
    d = datetime.datetime(d.year, d.month, d.day)
    return d

In [78]:
beer['time2'] = beer['time'].apply(lambda x: convert_stamp_to_date(x))

In [79]:
beer['time2'].head() 

0   2009-02-16
1   2009-03-01
2   2009-03-01
3   2009-02-15
4   2010-12-30
Name: time2, dtype: datetime64[ns]

In [80]:
beer.head()

Unnamed: 0.1,Unnamed: 0,names,Id,brewerId,ABV,style,appearance,aroma,palate,taste,overall,time,profile_name,text,time2,day,month,year,user_id
0,0,Sausa Weizen,47986,10325,5.0,Hefeweizen,2.5,2.0,1.5,1.5,1.5,1234817823,stcules,""" A lot of foam. But a lot.\tIn the smell some...",2009-02-16,1,1,1970,30561
1,1,Red Moon,48213,10325,6.2,English Strong Ale,3.0,2.5,3.0,3.0,3.0,1235915097,stcules,""" Dark red color, light beige foam, average.\t...",2009-03-01,1,1,1970,30561
2,2,Black Horse Black Beer,48215,10325,6.5,Foreign / Export Stout,3.0,2.5,3.0,3.0,3.0,1235916604,stcules,""" Almost totally black. Beige foam, quite comp...",2009-03-01,1,1,1970,30561
3,3,Sausa Pils,47969,10325,5.0,German Pilsener,3.5,3.0,2.5,3.0,3.0,1234725145,stcules,""" Golden yellow color. White, compact foam, qu...",2009-02-15,1,1,1970,30561
4,4,Cauldron DIPA,64883,1075,7.7,American Double / Imperial IPA,4.0,4.5,4.0,4.5,4.0,1293735206,johnmichaelsen,""" According to the website, the style for the ...",2010-12-30,1,1,1970,23004


### Split in Year, Month, Day

In [81]:
beer["day"] = beer['time2'].map(lambda x: x.day)
beer["month"] = beer['time2'].map(lambda x: x.month)
beer["year"] = beer['time2'].map(lambda x: x.year)

In [82]:
beer.head()

Unnamed: 0.1,Unnamed: 0,names,Id,brewerId,ABV,style,appearance,aroma,palate,taste,overall,time,profile_name,text,time2,day,month,year,user_id
0,0,Sausa Weizen,47986,10325,5.0,Hefeweizen,2.5,2.0,1.5,1.5,1.5,1234817823,stcules,""" A lot of foam. But a lot.\tIn the smell some...",2009-02-16,16,2,2009,30561
1,1,Red Moon,48213,10325,6.2,English Strong Ale,3.0,2.5,3.0,3.0,3.0,1235915097,stcules,""" Dark red color, light beige foam, average.\t...",2009-03-01,1,3,2009,30561
2,2,Black Horse Black Beer,48215,10325,6.5,Foreign / Export Stout,3.0,2.5,3.0,3.0,3.0,1235916604,stcules,""" Almost totally black. Beige foam, quite comp...",2009-03-01,1,3,2009,30561
3,3,Sausa Pils,47969,10325,5.0,German Pilsener,3.5,3.0,2.5,3.0,3.0,1234725145,stcules,""" Golden yellow color. White, compact foam, qu...",2009-02-15,15,2,2009,30561
4,4,Cauldron DIPA,64883,1075,7.7,American Double / Imperial IPA,4.0,4.5,4.0,4.5,4.0,1293735206,johnmichaelsen,""" According to the website, the style for the ...",2010-12-30,30,12,2010,23004


### eliminate NAN

In [83]:
beer=beer.replace(' ', np.nan)

In [84]:
beer.columns[beer.isnull().any()].tolist() 

[]

### Is there any column with missing values?

In [85]:
beer.columns[beer.isnull().any()].tolist() 

[]

### How many missing values are in each of those columns?

In [86]:
pd.isnull(beer).sum()

Unnamed: 0      0
names           0
Id              0
brewerId        0
ABV             0
style           0
appearance      0
aroma           0
palate          0
taste           0
overall         0
time            0
profile_name    0
text            0
time2           0
day             0
month           0
year            0
user_id         0
dtype: int64

### How to proceed with the missing values?
1.We are going to delete the profile name rows with missing values, since if we do not know which consumer it is, the associated data to that rows are not useful for our  recommendation system   
2.The missing  ABV  values will be replace for a "99" which is easy to identify and is not going to mixed up with the real values   
3.Lastly, the missing text values are going to be recoded as "no comments"

In [87]:
beer.ABV=beer.ABV.fillna(value=99)
beer.ABV.isnull().sum()

0

In [88]:
beer.text=beer.text.fillna(value="no comments")
beer.text.isnull().sum()

0

In [89]:
beer=beer.dropna(axis=0)

In [90]:
beer.columns[beer.isnull().any()].tolist() 

[]

In [91]:
beer.head(2)

Unnamed: 0.1,Unnamed: 0,names,Id,brewerId,ABV,style,appearance,aroma,palate,taste,overall,time,profile_name,text,time2,day,month,year,user_id
0,0,Sausa Weizen,47986,10325,5.0,Hefeweizen,2.5,2.0,1.5,1.5,1.5,1234817823,stcules,""" A lot of foam. But a lot.\tIn the smell some...",2009-02-16,16,2,2009,30561
1,1,Red Moon,48213,10325,6.2,English Strong Ale,3.0,2.5,3.0,3.0,3.0,1235915097,stcules,""" Dark red color, light beige foam, average.\t...",2009-03-01,1,3,2009,30561


In [92]:
beer.shape

(1585827, 19)

### we add a user Id column

In [93]:
beer['user_id'] = LabelEncoder().fit_transform(beer.profile_name)

In [94]:
beer.shape

(1585827, 19)

In [95]:
beer.head()

Unnamed: 0.1,Unnamed: 0,names,Id,brewerId,ABV,style,appearance,aroma,palate,taste,overall,time,profile_name,text,time2,day,month,year,user_id
0,0,Sausa Weizen,47986,10325,5.0,Hefeweizen,2.5,2.0,1.5,1.5,1.5,1234817823,stcules,""" A lot of foam. But a lot.\tIn the smell some...",2009-02-16,16,2,2009,30561
1,1,Red Moon,48213,10325,6.2,English Strong Ale,3.0,2.5,3.0,3.0,3.0,1235915097,stcules,""" Dark red color, light beige foam, average.\t...",2009-03-01,1,3,2009,30561
2,2,Black Horse Black Beer,48215,10325,6.5,Foreign / Export Stout,3.0,2.5,3.0,3.0,3.0,1235916604,stcules,""" Almost totally black. Beige foam, quite comp...",2009-03-01,1,3,2009,30561
3,3,Sausa Pils,47969,10325,5.0,German Pilsener,3.5,3.0,2.5,3.0,3.0,1234725145,stcules,""" Golden yellow color. White, compact foam, qu...",2009-02-15,15,2,2009,30561
4,4,Cauldron DIPA,64883,1075,7.7,American Double / Imperial IPA,4.0,4.5,4.0,4.5,4.0,1293735206,johnmichaelsen,""" According to the website, the style for the ...",2010-12-30,30,12,2010,23004


In [96]:
beer.to_csv ('df_beers.csv')