In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import ydata_profiling

import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_excel("train1.xlsx")

In [8]:
# run same process to clear test data
# df=pd.read_excel("test1.xlsx")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,Area_Extent,Perimeter,Major_Minor_Axis_Length,Eccentricity,Convex_Area,outcome
0,1,0,10729:0.794211268424988,414699000000000.0,171.729873657227_81.4269790649414,0.88044,10987,1
1,2,1,11732_0.637470126152039,441123000000000.0,176.483489990234;86.1924667358398,0.872626,12076,1
2,3,2,"nine thousand, two hundred and twenty-one_0.76...",387094000000000.0,159.224502563477_74.9033279418945,0.882439,9418,1
3,4,3,"eleven thousand, seven hundred and eighty-seve...",434514000000000.0,181.290466308594_83.3493347167969,0.888045,11965,1
4,5,4,12574_0.654725313186646,444631000000000.0,"174.560256958008,93.2424011230469",0.845386,12964,1


### Features:
* id: Unique ID of the rice sample
* Area Extent: Number of pixels within the boundaries of the rice grain and the ratio of the region formed by the rice grain to the bounding box pixels. separated by a delimiter
* Perimeter: Circumference around the boundaries of the rice grain
* Major Minor Axis Length: The main and small axis lengths, separated by a delimiter
* Eccentricity: Eccentricity of the rice grain
* Convex Area: Pixel count of the smallest convex Shell of the region formed by the rice grain
* outcome: The type of rice (1: Osmancik - 0: Cammeo)

In [10]:
df.shape

(3000, 8)

In [11]:
df.isna().sum()

Unnamed: 0                 0
id                         0
Area_Extent                0
Perimeter                  0
Major_Minor_Axis_Length    0
Eccentricity               0
Convex_Area                0
outcome                    0
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               3000 non-null   int64  
 1   id                       3000 non-null   int64  
 2   Area_Extent              3000 non-null   object 
 3   Perimeter                3000 non-null   float64
 4   Major_Minor_Axis_Length  3000 non-null   object 
 5   Eccentricity             3000 non-null   float64
 6   Convex_Area              3000 non-null   int64  
 7   outcome                  3000 non-null   int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 187.6+ KB


# Cleaning the data

In [13]:
df["Major_Axis_Length"]=df["Major_Minor_Axis_Length"].apply(lambda x: x.split(",")[0])
df["Major_Axis_Length"]=df["Major_Axis_Length"].apply(lambda x: x.split(";")[0])
df["Major_Axis_Length"]=df["Major_Axis_Length"].apply(lambda x: x.split("_")[0])

In [14]:
df["Minor_Axis_Length"]=df["Major_Minor_Axis_Length"].apply(lambda x: x.split("_")[~0])
df["Minor_Axis_Length"]=df["Minor_Axis_Length"].apply(lambda x: x.split(";")[~0])
df["Minor_Axis_Length"]=df["Minor_Axis_Length"].apply(lambda x: x.split(",")[~0])

In [15]:
df.drop("Major_Minor_Axis_Length",1, inplace=True)

In [16]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,Area_Extent,Perimeter,Eccentricity,Convex_Area,outcome,Major_Axis_Length,Minor_Axis_Length
0,1,0,10729:0.794211268424988,414699000000000.0,0.88044,10987,1,171.729873657227,81.4269790649414
1,2,1,11732_0.637470126152039,441123000000000.0,0.872626,12076,1,176.483489990234,86.1924667358398
2,3,2,"nine thousand, two hundred and twenty-one_0.76...",387094000000000.0,0.882439,9418,1,159.224502563477,74.9033279418945
3,4,3,"eleven thousand, seven hundred and eighty-seve...",434514000000000.0,0.888045,11965,1,181.290466308594,83.3493347167969
4,5,4,12574_0.654725313186646,444631000000000.0,0.845386,12964,1,174.560256958008,93.2424011230469


In [17]:
df["Number_of_pixels"]=df["Area_Extent"].apply(lambda x: x.split(":")[0])
df["Number_of_pixels"]=df["Number_of_pixels"].apply(lambda x: x.split("_")[0])
df["Number_of_pixels"]=df["Number_of_pixels"].apply(lambda x: x.split(";")[0])

In [18]:
df["ratio_of_reagion"]=df["Area_Extent"].apply(lambda x: x.split(":")[~0])
df["ratio_of_reagion"]=df["ratio_of_reagion"].apply(lambda x: x.split("_")[~0])
df["ratio_of_reagion"]=df["ratio_of_reagion"].apply(lambda x: x.split(";")[~0])

In [19]:
df.drop("Area_Extent",1,inplace=True)

In [20]:
df.drop("Unnamed: 0",1,inplace=True)

In [21]:
df.drop("id",1,inplace=True)

In [22]:
df.head()

Unnamed: 0,Perimeter,Eccentricity,Convex_Area,outcome,Major_Axis_Length,Minor_Axis_Length,Number_of_pixels,ratio_of_reagion
0,414699000000000.0,0.88044,10987,1,171.729873657227,81.4269790649414,10729,0.794211268424988
1,441123000000000.0,0.872626,12076,1,176.483489990234,86.1924667358398,11732,0.637470126152039
2,387094000000000.0,0.882439,9418,1,159.224502563477,74.9033279418945,"nine thousand, two hundred and twenty-one",0.7684166431427
3,434514000000000.0,0.888045,11965,1,181.290466308594,83.3493347167969,"eleven thousand, seven hundred and eighty-seven",0.74412876367569
4,444631000000000.0,0.845386,12964,1,174.560256958008,93.2424011230469,12574,0.654725313186646


In [23]:
df.columns

Index(['Perimeter', 'Eccentricity', 'Convex_Area', 'outcome',
       'Major_Axis_Length', 'Minor_Axis_Length', 'Number_of_pixels',
       'ratio_of_reagion'],
      dtype='object')

In [24]:
df=df.reindex(columns=['Perimeter', 'Eccentricity', 'Convex_Area',
       'Major_Axis_Length', 'Minor_Axis_Length', 'Number_of_pixels',
       'ratio_of_reagion','outcome'])

In [25]:
df.head()

Unnamed: 0,Perimeter,Eccentricity,Convex_Area,Major_Axis_Length,Minor_Axis_Length,Number_of_pixels,ratio_of_reagion,outcome
0,414699000000000.0,0.88044,10987,171.729873657227,81.4269790649414,10729,0.794211268424988,1
1,441123000000000.0,0.872626,12076,176.483489990234,86.1924667358398,11732,0.637470126152039,1
2,387094000000000.0,0.882439,9418,159.224502563477,74.9033279418945,"nine thousand, two hundred and twenty-one",0.7684166431427,1
3,434514000000000.0,0.888045,11965,181.290466308594,83.3493347167969,"eleven thousand, seven hundred and eighty-seven",0.74412876367569,1
4,444631000000000.0,0.845386,12964,174.560256958008,93.2424011230469,12574,0.654725313186646,1


In [26]:
df.dtypes

Perimeter            float64
Eccentricity         float64
Convex_Area            int64
Major_Axis_Length     object
Minor_Axis_Length     object
Number_of_pixels      object
ratio_of_reagion      object
outcome                int64
dtype: object

In [27]:
for i in df.columns:
    try:
        df[i]=df[i].astype(float)
    except:
        pass

In [28]:
df.dtypes

Perimeter            float64
Eccentricity         float64
Convex_Area          float64
Major_Axis_Length    float64
Minor_Axis_Length    float64
Number_of_pixels      object
ratio_of_reagion     float64
outcome              float64
dtype: object

In [29]:
df.iloc[546]["Number_of_pixels"]

'fourteen thousand, seven hundred'

In [30]:
df.Number_of_pixels[:30]

0                                               10729
1                                               11732
2           nine thousand, two hundred and twenty-one
3     eleven thousand, seven hundred and eighty-seven
4                                               12574
5                                               15107
6        ten thousand, five hundred and seventy-eight
7         ten thousand, four hundred and ninety-seven
8                     sixteen thousand and ninety-one
9                                               13998
10                                              15593
11        eleven thousand, five hundred and fifty-two
12                                              14998
13        ten thousand, eight hundred and eighty-nine
14        nine thousand, one hundred and twenty-seven
15         ten thousand, nine hundred and eighty-four
16                  fourteen thousand and seventy-six
17                                              12682
18                          

In [31]:
df["Number_of_pixels"]=df["Number_of_pixels"].apply(lambda x: x.replace("-"," "))

In [32]:
df["Number_of_pixels"]=df["Number_of_pixels"].apply(lambda x: x.replace("thousand","").replace("hundred",""))

In [33]:
df["Number_of_pixels"]=df["Number_of_pixels"].apply(lambda x: x.replace("  "," "))

In [34]:
a=["one","two","three","four","five","six","seven","eight","nine","ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen",
"eighteen","nineteen"]

In [35]:
dict1=dict(zip(a,range(1,20)))

In [36]:
b=["twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"]

In [37]:
dict2=dict(zip(b,range(2,10)))

In [38]:
new_dict=dict1|dict2

In [39]:
new_dict

{'one': 1,
 'two': 2,
 'three': 3,
 'four': 4,
 'five': 5,
 'six': 6,
 'seven': 7,
 'eight': 8,
 'nine': 9,
 'ten': 10,
 'eleven': 11,
 'twelve': 12,
 'thirteen': 13,
 'fourteen': 14,
 'fifteen': 15,
 'sixteen': 16,
 'seventeen': 17,
 'eighteen': 18,
 'nineteen': 19,
 'twenty': 2,
 'thirty': 3,
 'forty': 4,
 'fifty': 5,
 'sixty': 6,
 'seventy': 7,
 'eighty': 8,
 'ninety': 9}

In [40]:
index1=[]
index2=[]
for i in df["Number_of_pixels"]:
    if "," in i:
        index1.append(df[df["Number_of_pixels"]==i].index.values[0])
    else:
        index2.append(df[df["Number_of_pixels"]==i].index.values[0])
            
            
            

In [41]:
df.iloc[index1,5]=df.iloc[index1,5].apply(lambda x: x.replace("and","").replace(",","").replace("  "," "))

In [42]:
df.iloc[index1,5]

2              nine two twenty one
3        eleven seven eighty seven
6           ten five seventy eight
7            ten four ninety seven
11           eleven five fifty two
                   ...            
2990         eleven six sixty nine
2991      thirteen seven sixty two
2993        twelve six eighty four
2997    thirteen three ninety nine
2999       eight eight forty eight
Name: Number_of_pixels, Length: 1625, dtype: object

In [43]:
new=[]
for i in df.iloc[index1,5]:
    new.append(i.split(" "))

In [44]:
for i in new:
    for j in i:
        try:
            i[i.index(j)]=new_dict[j]
        except:
            pass

In [45]:
son=[]
for i in new:
    son.append([str(j) for j in i])

In [46]:
son2=[]
for i in son:
    son2.append("".join(i))

In [47]:
df.iloc[index1,5]=son2

In [48]:
df.iloc[index1,5]

2        9221
3       11787
6       10578
7       10497
11      11552
        ...  
2990    11669
2991    13762
2993    12684
2997    13399
2999     8848
Name: Number_of_pixels, Length: 1625, dtype: object

In [49]:
new=[]
for i in df.iloc[index2,5]:
    new.append(i.split(" "))

In [50]:
for i in new:
    for j in i:
        try:
            i[i.index(j)]=new_dict[j]
        except:
            pass

In [51]:
son=[]
for i in new:
    son.append([str(j) for j in i])

In [52]:
son2=[]
for i in son:
    son2.append("".join(i))

In [53]:

son3=[i.replace("and","0") for i in son2]

In [54]:
df.iloc[index2,5]=son3

In [55]:
notdig=[]
for i in df["Number_of_pixels"]:
    if i.isdigit()==False:
        notdig.append(df[df["Number_of_pixels"]==i].index.values[0])

In [56]:
df=df.drop(notdig,0)

In [57]:
df.reset_index(drop=True, inplace=True)

In [58]:
# run only for test data
df["outcome"]=df["outcome"].astype(int)

In [59]:
notdig2=[]
for i in df["Number_of_pixels"]:
    if i.isdigit()==False:
        notdig2.append(df[df["Number_of_pixels"]==i].index.values[0])

In [60]:
df=df.drop(notdig2,0)
df.reset_index(drop=True, inplace=True)

In [61]:
notdig3=[]
for i in df["Number_of_pixels"]:
    if i.isdigit()==False:
        notdig3.append(df[df["Number_of_pixels"]==i].index.values[0])

In [62]:
notdig3

[2418, 2466, 2517]

In [63]:
df=df.drop(notdig3,0)

In [64]:
df.reset_index(drop=True, inplace=True)

In [65]:
df["Number_of_pixels"]=df["Number_of_pixels"].astype(int)

In [66]:
df.dtypes

Perimeter            float64
Eccentricity         float64
Convex_Area          float64
Major_Axis_Length    float64
Minor_Axis_Length    float64
Number_of_pixels       int32
ratio_of_reagion     float64
outcome                int32
dtype: object

In [68]:
df.head(15)

Unnamed: 0,Perimeter,Eccentricity,Convex_Area,Major_Axis_Length,Minor_Axis_Length,Number_of_pixels,ratio_of_reagion,outcome
0,414699000000000.0,0.88044,10987.0,171.729874,81.426979,10729,0.794211,1
1,441123000000000.0,0.872626,12076.0,176.48349,86.192467,11732,0.63747,1
2,387094000000000.0,0.882439,9418.0,159.224503,74.903328,9221,0.768417,1
3,434514000000000.0,0.888045,11965.0,181.290466,83.349335,11787,0.744129,1
4,444631000000000.0,0.845386,12964.0,174.560257,93.242401,12574,0.654725,1
5,488866000000000.0,0.887993,15322.0,205.21521,94.36982,15107,0.787233,0
6,414620000000000.0,0.873766,10899.0,167.751923,81.585701,10578,0.603974,1
7,417506000000000.0,0.880835,10753.0,169.240204,80.122292,10497,0.581616,1
8,508831000000000.0,0.880323,16431.0,209.183884,99.231758,16091,0.670458,0
9,484882000000000.0,0.889188,14328.0,198.721298,90.923409,13998,0.599512,0
