In [1]:
import pandas as pd
import numpy as np

In [2]:
train_raw = pd.read_csv('train.csv')
test_raw  = pd.read_csv('test.csv')

train_raw

Unnamed: 0.1,Unnamed: 0,Species,Farm.Name,Lot.Number,Altitude,Number.of.Bags,Bag.Weight,Harvest.Year,Grading.Date,Variety,...,Sweetness,Cupper.Points,Total.Cup.Points,Moisture,Category.One.Defects,Quakers,Color,Category.Two.Defects,Expiration,Country.of.Origin
0,0,Arabica,conquista / morito,,,250,1 kg,2012,"January 13th, 2012",Bourbon,...,10.0,7.00,78.33,0.11,0,0.0,Green,9,"January 12th, 2013",2
1,1,Arabica,,,de 1.600 a 1.950 msn,275,70 kg,Mayo a Julio,"July 29th, 2011",Caturra,...,10.0,8.17,83.08,0.01,0,0.0,,0,"July 28th, 2012",1
2,2,Arabica,la esmeralda,11/23/0634,4000,25,69 kg,2017,"September 8th, 2017",Bourbon,...,10.0,7.50,82.58,0.10,0,1.0,Green,2,"September 8th, 2018",2
3,3,Arabica,fazenda santo antonio,,900-1100,305,2 kg,2014,"February 13th, 2015",Catuai,...,10.0,7.58,83.00,0.00,0,0.0,Green,0,"February 13th, 2016",0
4,4,Arabica,,,,1,5 lbs,2013,"September 12th, 2014",,...,10.0,7.25,82.08,0.11,1,0.0,Green,0,"September 12th, 2015",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580,580,Arabica,,,1800,275,2 kg,2015/2016,"June 5th, 2015",Other,...,10.0,7.83,84.25,0.12,0,0.0,Green,0,"June 4th, 2016",1
581,581,Arabica,cafetal,101,1300,200,69 kg,2017,"July 3rd, 2017",Mundo Novo,...,10.0,7.83,83.92,0.11,0,0.0,Green,5,"July 3rd, 2018",3
582,582,Arabica,las lomas,,1200,250,1 kg,2012,"July 11th, 2012",Bourbon,...,10.0,7.67,83.00,0.11,5,0.0,Green,8,"July 11th, 2013",3
583,583,Arabica,,,1800 msnm,250,70 kg,4T/10,"February 9th, 2011",,...,10.0,7.42,82.17,0.08,0,0.0,,0,"February 9th, 2012",1


In [3]:
train_raw.isna().sum(), test_raw.isna().sum()

(Unnamed: 0                0
 Species                   0
 Farm.Name               166
 Lot.Number              480
 Altitude                 67
 Number.of.Bags            0
 Bag.Weight                0
 Harvest.Year             10
 Grading.Date              0
 Variety                  61
 Processing.Method        52
 Aroma                     0
 Flavor                    0
 Aftertaste                0
 Acidity                   0
 Body                      0
 Balance                   0
 Uniformity                0
 Clean.Cup                 0
 Sweetness                 0
 Cupper.Points             0
 Total.Cup.Points          0
 Moisture                  0
 Category.One.Defects      0
 Quakers                   1
 Color                   116
 Category.Two.Defects      0
 Expiration                0
 Country.of.Origin         0
 dtype: int64,
 Unnamed: 0                0
 Species                   0
 Farm.Name                49
 Lot.Number              118
 Altitude                 20

### Drop Columns

In [4]:
train_raw.columns

Index(['Unnamed: 0', 'Species', 'Farm.Name', 'Lot.Number', 'Altitude',
       'Number.of.Bags', 'Bag.Weight', 'Harvest.Year', 'Grading.Date',
       'Variety', 'Processing.Method', 'Aroma', 'Flavor', 'Aftertaste',
       'Acidity', 'Body', 'Balance', 'Uniformity', 'Clean.Cup', 'Sweetness',
       'Cupper.Points', 'Total.Cup.Points', 'Moisture', 'Category.One.Defects',
       'Quakers', 'Color', 'Category.Two.Defects', 'Expiration',
       'Country.of.Origin'],
      dtype='object')

In [5]:
drop_cols = ['Farm.Name','Lot.Number','Harvest.Year','Species', 'Number.of.Bags','Bag.Weight','Grading.Date','Expiration','Unnamed: 0']    
#'Altitude' , 'Variety','Processing.Method','Color','Quakers'
drop_cols += []     # meaningless

train = train_raw.drop(columns=drop_cols)
test = test_raw.drop(columns=drop_cols)
train['Quakers'].fillna(0)
test['Quakers'].fillna(0)

0      0
1      0
2      0
3      0
4      0
      ..
142    0
143    0
144    0
145    0
146    0
Name: Quakers, Length: 147, dtype: int64

### Preprocessing

#### Altitude

In [6]:
import re

altitudes = train_raw['Altitude'].to_numpy()


def preprocess_altitudes(altitudes):
    lst = []
    for altitude in altitudes:
        altitude = str(altitude)
        if altitude == 'nan' : lst.append(0)
        else:
        # Remove chars except 0-9 and -
            cleaned_altitude = re.sub(r'[^0-9\-]+', '', altitude)
            
            # Handle ranges
            if '-' in cleaned_altitude:
                start, end = map(int, cleaned_altitude.split('-'))
                average = (start + end) / 2
                lst.append(average)

            elif len(cleaned_altitude)>=7:
                start, end = int(cleaned_altitude)/10000, int(cleaned_altitude)%10000
                average = (start + end) / 2
                lst.append(average)

            else:
                try:
                    lst.append(int(cleaned_altitude))
                except:
                    lst.append(0)
        
    return lst

preprocessed_altitudes = preprocess_altitudes(altitudes)
print(preprocessed_altitudes)


[0, 1775.0974999999999, 4000, 1000.0, 0, 1750, 4300, 12, 1300, 1750, 1700, 1400, 1750, 774, 442, 905, 890, 4000, 442, 1300, 1775.0974999999999, 1400, 1261, 1100, 1, 1100, 995, 1200, 1600, 1350, 1000, 0, 3280, 1500, 1500, 4540, 0, 1000, 1800, 0, 0, 0, 1550, 1750, 1100, 1450.0, 0, 1775.0974999999999, 1800, 1600, 1, 1775.0974999999999, 1500, 0, 1750, 0, 439, 890, 1200, 1300, 1700, 982, 695, 1450, 1483, 0, 1100, 1170, 1800, 1000, 0, 1750, 700, 1901, 4000, 950, 1000, 5000, 1100, 1200, 1400, 1000, 1800, 4300, 1700, 900, 1700, 950, 1200, 905, 0, 1250, 1750, 1200, 1100, 1100, 0, 1250, 1450, 1650, 1210, 5000, 1600, 1775.0974999999999, 1750.095, 1550, 4000, 1000, 4300, 1550, 1100, 1400, 4000, 1300, 940, 1400, 1350, 1550.0900000000001, 900, 1770, 1400.0, 442, 0, 0, 1250, 4300, 1775.0974999999999, 1300, 1150, 0, 1400, 0, 1800, 1600, 0, 1250, 1400.075, 1750, 940, 1775.0974999999999, 3850.295, 1250, 0, 1500, 442, 4000, 1500, 5000, 1450, 1300, 1800, 3607, 1150, 4000, 0, 1750, 4300, 1250, 2527, 4300, 

In [7]:
mean_val = np.mean(preprocessed_altitudes)
preprocessed_altitudes = [val if val != 0 else mean_val for val in preprocessed_altitudes]

train['Altitude'] = preprocessed_altitudes
train


Unnamed: 0,Altitude,Variety,Processing.Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean.Cup,Sweetness,Cupper.Points,Total.Cup.Points,Moisture,Category.One.Defects,Quakers,Color,Category.Two.Defects,Country.of.Origin
0,2235.355303,Bourbon,Washed / Wet,7.17,6.08,6.17,7.25,7.33,7.33,10.0,10.0,10.0,7.00,78.33,0.11,0,0.0,Green,9,2
1,1775.097500,Caturra,,7.33,7.33,7.33,7.58,7.25,8.08,10.0,10.0,10.0,8.17,83.08,0.01,0,0.0,,0,1
2,4000.000000,Bourbon,Washed / Wet,7.50,7.42,7.42,7.75,7.50,7.50,10.0,10.0,10.0,7.50,82.58,0.10,0,1.0,Green,2,2
3,1000.000000,Catuai,Natural / Dry,7.67,7.67,7.67,7.50,7.42,7.50,10.0,10.0,10.0,7.58,83.00,0.00,0,0.0,Green,0,0
4,2235.355303,,Washed / Wet,7.50,7.42,7.42,7.42,7.67,7.42,10.0,10.0,10.0,7.25,82.08,0.11,1,0.0,Green,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580,1800.000000,Other,Washed / Wet,7.83,7.75,7.83,7.58,7.67,7.75,10.0,10.0,10.0,7.83,84.25,0.12,0,0.0,Green,0,1
581,1300.000000,Mundo Novo,Washed / Wet,7.83,7.83,7.58,7.92,7.42,7.50,10.0,10.0,10.0,7.83,83.92,0.11,0,0.0,Green,5,3
582,1200.000000,Bourbon,Washed / Wet,7.50,7.67,7.50,7.50,7.58,7.58,10.0,10.0,10.0,7.67,83.00,0.11,5,0.0,Green,8,3
583,1800.000000,,,7.58,7.67,7.42,7.17,7.17,7.75,10.0,10.0,10.0,7.42,82.17,0.08,0,0.0,,0,1


In [8]:
import re

altitudes = test_raw['Altitude'].to_numpy()


def preprocess_altitudes(altitudes):
    lst = []
    for altitude in altitudes:
        altitude = str(altitude)
        if altitude == 'nan' : lst.append(0)
        else:
        # Remove chars except 0-9 and -
            cleaned_altitude = re.sub(r'[^0-9\-]+', '', altitude)
            
            # Handle ranges
            if '-' in cleaned_altitude:
                start, end = map(int, cleaned_altitude.split('-'))
                average = (start + end) / 2
                lst.append(average)

            elif len(cleaned_altitude)>=7:
                start, end = int(cleaned_altitude)/10000, int(cleaned_altitude)%10000
                average = (start + end) / 2
                lst.append(average)

            else:
                try:
                    lst.append(int(cleaned_altitude))
                except:
                    lst.append(0)
        
    return lst

preprocessed_altitudes = preprocess_altitudes(altitudes)
print(preprocessed_altitudes)

[1250, 1600, 1775.0974999999999, 1020, 975, 3280, 0, 1100, 800, 1600, 0, 1679, 2136, 1200, 2560, 1700, 442, 442, 4600, 1350, 4300, 1170, 3280, 4550.0, 0, 700, 1565, 1650, 1100, 1700, 1250, 1900.0, 1550, 1450, 1250, 1000, 808, 5600.0, 0, 1800, 0, 1144, 1550, 3702, 1, 0, 1200, 1100, 1150, 1050, 0, 1260, 1650, 1500, 1775.0974999999999, 0, 1000, 1, 4300, 0, 1, 5000, 1600, 4300, 0, 1200, 1650, 1750, 1275.0675, 1022, 1400, 4300, 0, 0, 934, 1450, 1700, 1248, 1300, 1200, 1800, 0, 1500, 1320, 0, 900, 1000, 1296, 1750, 4300, 1200, 1000, 1500, 1500, 1000, 4000, 894, 1, 1775.0974999999999, 1750, 5500, 1775.0974999999999, 1100, 1150, 0, 1100, 1775.0974999999999, 1600.0, 0, 1200, 1880, 4000, 1775.0, 1100, 1900.0, 1500, 0, 1400, 0, 1560, 944, 1625.0925, 1170, 1320, 950, 5000, 1650, 165, 442, 1200, 1250, 1800, 1050, 1280, 1775.0974999999999, 0, 1200, 1, 1775.0974999999999, 1300, 0, 1750, 1775.0974999999999, 0, 1800, 1100, 1750]


In [9]:
mean_val = np.mean(preprocessed_altitudes)
preprocessed_altitudes = [val if val != 0 else mean_val for val in preprocessed_altitudes]
test['Altitude'] = preprocessed_altitudes
test

Unnamed: 0,Altitude,Variety,Processing.Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean.Cup,Sweetness,Cupper.Points,Total.Cup.Points,Moisture,Category.One.Defects,Quakers,Color,Category.Two.Defects
0,1250.000000,Typica,Washed / Wet,7.33,7.08,6.92,7.42,7.25,7.08,10.0,10.0,10.0,7.08,80.17,0.11,0,0,Green,2
1,1600.000000,Bourbon,Washed / Wet,7.75,7.83,7.58,8.00,7.92,7.75,10.0,10.0,10.0,7.83,84.67,0.10,0,0,Green,1
2,1775.097500,Caturra,Washed / Wet,7.83,7.50,7.50,7.42,7.50,7.50,10.0,10.0,10.0,7.58,82.83,0.11,0,0,Green,3
3,1020.000000,Catuai,Natural / Dry,7.42,7.33,7.33,7.50,7.50,7.33,10.0,10.0,10.0,7.50,81.92,0.11,0,0,Green,4
4,975.000000,Caturra,Washed / Wet,7.33,7.17,7.00,6.75,7.08,7.00,10.0,10.0,10.0,7.33,79.67,0.13,5,0,,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,1775.097500,Caturra,Washed / Wet,7.83,7.67,7.75,7.58,7.92,7.75,10.0,10.0,10.0,7.75,84.25,0.12,4,0,Green,2
143,1447.067619,,,7.58,7.58,7.58,7.67,7.75,7.67,10.0,10.0,10.0,7.58,83.42,0.10,1,0,,3
144,1800.000000,Caturra,Washed / Wet,7.75,7.58,7.67,7.25,7.58,7.42,10.0,10.0,10.0,7.42,82.67,0.11,0,0,Bluish-Green,3
145,1100.000000,Catuai,Semi-washed / Semi-pulped,7.83,7.75,7.50,7.75,7.50,7.58,10.0,10.0,10.0,7.67,83.58,0.12,0,0,Blue-Green,2


#### variety

In [10]:
variety = train['Variety']
variety.fillna('Other',inplace=True)

In [11]:
variety = test['Variety']
variety.fillna('Other',inplace=True)

#### Processing.Method

In [12]:
P_method = train['Processing.Method']
P_method.fillna('Other',inplace=True)

In [13]:
P_method = test['Processing.Method']
P_method.fillna('Other',inplace=True)

#### color

In [14]:
color = train['Color']
color.fillna('Other',inplace=True)

In [15]:
color = test['Color']
color.fillna('Other',inplace=True)

In [16]:
train.isna().sum().to_numpy().all(),test.isna().sum().to_numpy().all()

(False, False)

In [17]:
train.columns

Index(['Altitude', 'Variety', 'Processing.Method', 'Aroma', 'Flavor',
       'Aftertaste', 'Acidity', 'Body', 'Balance', 'Uniformity', 'Clean.Cup',
       'Sweetness', 'Cupper.Points', 'Total.Cup.Points', 'Moisture',
       'Category.One.Defects', 'Quakers', 'Color', 'Category.Two.Defects',
       'Country.of.Origin'],
      dtype='object')

In [18]:
test.columns

Index(['Altitude', 'Variety', 'Processing.Method', 'Aroma', 'Flavor',
       'Aftertaste', 'Acidity', 'Body', 'Balance', 'Uniformity', 'Clean.Cup',
       'Sweetness', 'Cupper.Points', 'Total.Cup.Points', 'Moisture',
       'Category.One.Defects', 'Quakers', 'Color', 'Category.Two.Defects'],
      dtype='object')

#### One-Hot Encoding

In [19]:
one_hot_train = pd.get_dummies(train)
one_hot_test = pd.get_dummies(test)

In [20]:
one_hot_train.corr()['Country.of.Origin']


Altitude                                       0.018615
Aroma                                         -0.170782
Flavor                                        -0.261109
Aftertaste                                    -0.310908
Acidity                                       -0.114522
Body                                          -0.290722
Balance                                       -0.316077
Uniformity                                    -0.100379
Clean.Cup                                     -0.039066
Sweetness                                      0.000975
Cupper.Points                                 -0.302856
Total.Cup.Points                              -0.258803
Moisture                                       0.331794
Category.One.Defects                           0.132010
Quakers                                       -0.139952
Category.Two.Defects                           0.250217
Country.of.Origin                              1.000000
Variety_Arusha                                -0

In [22]:
one_hot_train.to_csv('preprocessed_train.csv')
one_hot_test.to_csv('preprocessed_test.csv')