In [1]:
import numpy as np
# function untuk mencetak file arff
def pandas2arff(df,filename,wekaname = "pandasdata",cleanstringdata=True,cleannan=True):
    """
    converts the pandas dataframe to a weka compatible file
    df: dataframe in pandas format
    filename: the filename you want the weka compatible file to be in
    wekaname: the name you want to give to the weka dataset (this will be visible to you when you open it in Weka)
    cleanstringdata: clean up data which may have spaces and replace with "_", special characters etc which seem to annoy Weka. 
                     To suppress this, set this to False
    cleannan: replaces all nan values with "?" which is Weka's standard for missing values. 
              To suppress this, set this to False
    """
    import re
    
    def cleanstring(s):
        if s!="?":
            return re.sub('[^A-Za-z0-9]+', "_", str(s))
        else:
            return "?"
            
    dfcopy = df #all cleaning operations get done on this copy

    
    if cleannan!=False:
        dfcopy = dfcopy.fillna(-999999999) #this is so that we can swap this out for "?"
        #this makes sure that certain numerical columns with missing values don't get stuck with "object" type
 
    f = open(filename,"w")
    arffList = []
    arffList.append("@relation " + wekaname + "\n\n")
    #look at each column's dtype. If it's an "object", make it "nominal" under Weka for now (can be changed in source for dates.. etc)
    for i in range(df.shape[1]):
        if dfcopy.dtypes[i]=='O' or (df.columns[i] in ["Class","CLASS","class"]):
            if cleannan!=False:
                dfcopy.iloc[:,i] = dfcopy.iloc[:,i].replace(to_replace=-999999999, value="?")
            if cleanstringdata!=False:
                dfcopy.iloc[:,i] = dfcopy.iloc[:,i].apply(cleanstring)
            _uniqueNominalVals = [str(_i) for _i in np.unique(dfcopy.iloc[:,i])]
            _uniqueNominalVals = ",".join(_uniqueNominalVals)
            _uniqueNominalVals = _uniqueNominalVals.replace("[","")
            _uniqueNominalVals = _uniqueNominalVals.replace("]","")
            _uniqueValuesString = "{" + _uniqueNominalVals +"}" 
            arffList.append("@attribute " + df.columns[i] + _uniqueValuesString + "\n")
        else:
            arffList.append("@attribute " + df.columns[i] + " numeric\n") 
            #even if it is an integer, let's just deal with it as a real number for now
    arffList.append("\n@data\n")           
    for i in range(dfcopy.shape[0]):#instances
        _instanceString = ""
        for j in range(df.shape[1]):#features
                if dfcopy.dtypes[j]=='O':
                    _instanceString+="\"" + str(dfcopy.iloc[i,j]) + "\""
                else:
                    _instanceString+=str(dfcopy.iloc[i,j])
                if j!=dfcopy.shape[1]-1:#if it's not the last feature, add a comma
                    _instanceString+=","
        _instanceString+="\n"
        if cleannan!=False:
            _instanceString = _instanceString.replace("-999999999.0","?") #for numeric missing values
            _instanceString = _instanceString.replace("\"?\"","?") #for categorical missing values
        arffList.append(_instanceString)
    f.writelines(arffList)
    f.close()
    del dfcopy
    return True

In [2]:
import pandas as pd
from scipy.io.arff import loadarff 

# read file to dataframe
raw_data = loadarff(open('DataAbalone.arff','r'))
df_data = pd.DataFrame(raw_data[0])
# membersihkan data rusak pada JenisKelamin
df_data['JenisKelamin'] = df_data['JenisKelamin'].str.decode('utf-8') 

df_data

Unnamed: 0,Diameter,Rings,Height,Length,ShellWeight,ShuckedWeight,WholeWeight,VisceraWeight,JenisKelamin
0,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7.0,M
1,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9.0,F
2,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10.0,M
3,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7.0,I
4,0.425,0.300,0.095,0.3515,0.1410,0.0775,0.1200,8.0,I
...,...,...,...,...,...,...,...,...,...
4171,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11.0,F
4172,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10.0,M
4173,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9.0,M
4174,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10.0,F


In [3]:
# acak sample training sebasar 80%
df_training = df_data.sample(n = 3342)
df_training 

Unnamed: 0,Diameter,Rings,Height,Length,ShellWeight,ShuckedWeight,WholeWeight,VisceraWeight,JenisKelamin
3090,0.495,0.380,0.120,0.4740,0.1970,0.1065,0.1545,10.0,M
1653,0.595,0.460,0.150,0.8335,0.3770,0.1925,0.2350,8.0,I
4161,0.385,0.255,0.100,0.3175,0.1370,0.0680,0.0920,8.0,M
1918,0.605,0.490,0.165,1.0710,0.4820,0.1935,0.3520,10.0,I
3604,0.495,0.375,0.115,0.5755,0.3100,0.1145,0.1395,8.0,I
...,...,...,...,...,...,...,...,...,...
1314,0.550,0.425,0.135,0.7305,0.3325,0.1545,0.2150,9.0,I
531,0.470,0.370,0.120,0.4705,0.1845,0.1055,0.1550,12.0,I
2767,0.560,0.435,0.135,0.7200,0.3290,0.1030,0.2510,11.0,I
2008,0.430,0.320,0.110,0.3675,0.1675,0.1020,0.1050,8.0,I


In [4]:
# acak sample testing sebasar 20%
df_testing = df_data.sample(n = 835)
df_testing

Unnamed: 0,Diameter,Rings,Height,Length,ShellWeight,ShuckedWeight,WholeWeight,VisceraWeight,JenisKelamin
2819,0.375,0.285,0.090,0.2545,0.1190,0.0595,0.0675,6.0,I
1588,0.520,0.410,0.145,0.6460,0.2965,0.1595,0.1650,9.0,I
467,0.680,0.550,0.210,1.7445,0.5975,0.3050,0.6250,17.0,F
2847,0.625,0.485,0.160,1.2540,0.5910,0.2590,0.3485,9.0,F
2467,0.370,0.275,0.080,0.2270,0.0930,0.0625,0.0700,8.0,F
...,...,...,...,...,...,...,...,...,...
137,0.405,0.325,0.110,0.3555,0.1510,0.0630,0.1170,9.0,F
2990,0.595,0.485,0.150,1.0835,0.5305,0.2310,0.2760,8.0,M
1378,0.620,0.475,0.160,1.1295,0.4630,0.2685,0.3300,10.0,F
3944,0.235,0.175,0.065,0.0615,0.0205,0.0200,0.0190,6.0,I


In [5]:
pandas2arff(df_training, "Abalone-training.arff", wekaname="Abalone-training", cleanstringdata=True, cleannan=True)
pandas2arff(df_testing, "Abalone-testing.arff", wekaname="Abalone-testing", cleanstringdata=True, cleannan=True)

True