In [1]:
import numpy as np

def pandas2arff(df,filename,wekaname = "pandasdata",cleanstringdata=True,cleannan=True):
    """
    Write a pandas DataFrame to a Weka-compatible ARFF file.

    df: dataframe in pandas format. It is only read; the caller's frame is
        never modified.
    filename: path of the ARFF file to write (opened in "w" mode).
    wekaname: relation name written on the "@relation" line (this is the name
              shown when the file is opened in Weka).
    cleanstringdata: unless False, every run of characters other than ASCII
                     letters and digits in a nominal value is replaced with "_".
                     To suppress this, set this to False.
    cleannan: unless False, missing values are written as "?", which is Weka's
              marker for missing data. To suppress this, set this to False.

    Columns whose dtype is object, and any column named "Class", "CLASS" or
    "class", are declared nominal with the sorted set of their values; every
    other column is declared numeric. Values of object-dtype columns are
    written in double quotes in the data section.

    Returns True once the file has been written.
    """
    import re

    missing_sentinel = -999999999  # temporary stand-in for NaN, swapped for "?" below
    class_names = ("Class", "CLASS", "class")
    non_alnum = re.compile('[^A-Za-z0-9]+')

    def cleanstring(s):
        # "?" is the missing-value marker and must survive the cleanup untouched.
        if s != "?":
            return non_alnum.sub("_", str(s))
        return "?"

    # fillna returns a new frame, and every later step builds new Series instead
    # of assigning back into the frame, so `df` itself is never written to.
    if cleannan != False:
        work = df.fillna(missing_sentinel)
    else:
        work = df

    header = ["@relation " + wekaname + "\n\n"]
    prepared = []  # one (values, quoted) pair per column, in column order

    # Look at each column's dtype. Object columns (and class columns) become
    # nominal attributes; everything else is treated as a real number.
    for i in range(df.shape[1]):
        name = df.columns[i]
        col = work.iloc[:, i]  # positional access; works for any column labels
        if col.dtype == 'O' or (name in class_names):
            if cleannan != False:
                col = col.replace(to_replace=missing_sentinel, value="?")
            if cleanstringdata != False:
                # astype(object) keeps the cleaned strings in an object column so
                # they are quoted in the data section regardless of how pandas
                # infers the dtype of the apply() result.
                col = col.apply(cleanstring).astype(object)
            labels = [str(value) for value in np.unique(col)]
            if cleannan != False:
                # "?" marks a missing value; it is not a label of the attribute.
                labels = [label for label in labels if label != "?"]
            nominal = ",".join(labels).replace("[", "").replace("]", "")
            header.append("@attribute " + str(name) + " {" + nominal + "}\n")
        else:
            # even if it is an integer, just deal with it as a real number
            header.append("@attribute " + str(name) + " numeric\n")
        prepared.append((col, col.dtype == 'O'))

    body = ["\n@data\n"]
    for row in range(work.shape[0]):  # instances
        cells = []
        for col, quoted in prepared:  # features
            text = str(col.iloc[row])
            cells.append("\"" + text + "\"" if quoted else text)
        line = ",".join(cells) + "\n"
        if cleannan != False:
            line = line.replace("-999999999.0", "?")  # numeric missing values
            line = line.replace("\"?\"", "?")  # categorical missing values
        body.append(line)

    # Open the file only once all content is ready, and let the context manager
    # close it even if the write fails.
    with open(filename, "w") as f:
        f.writelines(header + body)
    return True

In [2]:
import pandas as pd
from scipy.io.arff import loadarff 

# Read the ARFF file into a dataframe.
# Passing the path (rather than an open file object) lets loadarff open and
# close the file itself, so no file handle is left open in the kernel.
raw_data = loadarff('arffWdbc.arff')
df_data = pd.DataFrame(raw_data[0])
# The Diagnosis values are loaded as bytes; decode them to ordinary strings.
df_data['Diagnosis'] = df_data['Diagnosis'].str.decode('utf-8') 

df_data

Unnamed: 0,Mean-radius,Mean-texture,Mean-perimeter,Mean-area,Mean-smoothness,Mean-compactness,Mean-concavity,Mean-concavePoints,Mean-symmetry,Mean-fractalDimension,...,Worth-texture,Worth-perimeter,Worth-area,Worth-smoothness,Worth-compactness,Worth-concavity,Worth-concavePoints,Worth-symmetry,Worth-fractalDimension,Diagnosis
0,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,M
1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,M
2,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,M
3,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,M
4,12.45,15.70,82.57,477.1,0.12780,0.17000,0.15780,0.08089,0.2087,0.07613,...,23.75,103.40,741.6,0.17910,0.52490,0.5355,0.1741,0.3985,0.12440,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,M
564,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,M
565,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,M
566,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,M


In [3]:
# Randomly sample 399 rows for training (intended as 70% of the data).
# A fixed random_state makes the split reproducible when the notebook is re-run.
df_training = df_data.sample(n = 399, random_state = 42)
df_training  

Unnamed: 0,Mean-radius,Mean-texture,Mean-perimeter,Mean-area,Mean-smoothness,Mean-compactness,Mean-concavity,Mean-concavePoints,Mean-symmetry,Mean-fractalDimension,...,Worth-texture,Worth-perimeter,Worth-area,Worth-smoothness,Worth-compactness,Worth-concavity,Worth-concavePoints,Worth-symmetry,Worth-fractalDimension,Diagnosis
207,13.110,22.54,87.02,529.4,0.10020,0.14830,0.087050,0.051020,0.1850,0.07310,...,29.16,99.48,639.3,0.1349,0.4402,0.31620,0.11260,0.4128,0.10760,B
237,14.220,27.85,92.55,623.9,0.08223,0.10390,0.110300,0.044080,0.1342,0.06129,...,40.54,102.50,764.0,0.1081,0.2426,0.30640,0.08219,0.1890,0.07796,B
19,13.080,15.71,85.63,520.0,0.10750,0.12700,0.045680,0.031100,0.1967,0.06811,...,20.49,96.09,630.5,0.1312,0.2776,0.18900,0.07283,0.3184,0.08183,B
455,11.630,29.29,74.87,415.1,0.09357,0.08574,0.071600,0.020170,0.1799,0.06166,...,38.81,86.04,527.8,0.1406,0.2031,0.29230,0.06835,0.2884,0.07220,B
257,15.660,23.20,110.20,773.5,0.11090,0.31140,0.317600,0.137700,0.2495,0.08104,...,31.64,143.70,1226.0,0.1504,0.5172,0.61810,0.24620,0.3277,0.10190,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,11.810,17.39,75.27,428.9,0.10070,0.05562,0.023530,0.015530,0.1718,0.05780,...,26.48,79.57,489.5,0.1356,0.1000,0.08803,0.04306,0.3200,0.06576,B
418,11.160,21.41,70.95,380.3,0.10180,0.05978,0.008955,0.010760,0.1615,0.06144,...,28.92,79.26,458.0,0.1282,0.1108,0.03582,0.04306,0.2976,0.07123,B
564,20.130,28.25,131.20,1261.0,0.09780,0.10340,0.144000,0.097910,0.1752,0.05533,...,38.25,155.00,1731.0,0.1166,0.1922,0.32150,0.16280,0.2572,0.06637,M
268,10.710,20.39,69.50,344.9,0.10820,0.12890,0.084480,0.028670,0.1668,0.06862,...,25.21,76.51,410.4,0.1335,0.2550,0.25340,0.08600,0.2605,0.08701,B


In [4]:
# Testing set (intended as the remaining 30% of the data): every row that was
# NOT drawn for training, shuffled. Sampling df_data a second time would let
# training rows reappear in the test set.
df_testing = df_data.drop(index = df_training.index).sample(frac = 1, random_state = 42)
df_testing

Unnamed: 0,Mean-radius,Mean-texture,Mean-perimeter,Mean-area,Mean-smoothness,Mean-compactness,Mean-concavity,Mean-concavePoints,Mean-symmetry,Mean-fractalDimension,...,Worth-texture,Worth-perimeter,Worth-area,Worth-smoothness,Worth-compactness,Worth-concavity,Worth-concavePoints,Worth-symmetry,Worth-fractalDimension,Diagnosis
156,16.840,19.46,108.40,880.2,0.07445,0.07223,0.05150,0.02771,0.1844,0.05268,...,28.07,120.30,1032.0,0.08774,0.1710,0.1882,0.08436,0.2527,0.05972,B
102,9.876,19.40,63.95,298.3,0.10050,0.09697,0.06154,0.03029,0.1945,0.06322,...,26.83,72.22,361.2,0.15590,0.2302,0.2644,0.09749,0.2622,0.08490,B
16,16.130,20.68,108.10,798.8,0.11700,0.20220,0.17220,0.10280,0.2164,0.07356,...,31.48,136.80,1315.0,0.17890,0.4233,0.4784,0.20730,0.3706,0.11420,M
355,13.050,18.59,85.09,512.0,0.10820,0.13040,0.09603,0.05603,0.2035,0.06501,...,24.85,94.22,591.2,0.13430,0.2658,0.2573,0.12580,0.3113,0.08317,B
459,17.080,27.15,111.20,930.9,0.09898,0.11100,0.10070,0.06431,0.1793,0.06281,...,34.49,152.10,1648.0,0.16000,0.2444,0.2639,0.15550,0.3010,0.09060,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,14.250,21.72,93.63,633.0,0.09823,0.10980,0.13190,0.05598,0.1885,0.06125,...,30.36,116.20,799.6,0.14460,0.4238,0.5186,0.14470,0.3591,0.10140,M
261,17.290,22.13,114.40,947.8,0.08999,0.12730,0.09697,0.07507,0.2108,0.05464,...,27.24,137.90,1295.0,0.11340,0.2867,0.2298,0.15280,0.3067,0.07484,M
469,9.667,18.49,61.49,289.1,0.08946,0.06258,0.02948,0.01514,0.2238,0.06413,...,25.62,70.88,385.2,0.12340,0.1542,0.1277,0.06560,0.3174,0.08524,B
28,17.570,15.05,115.00,955.1,0.09847,0.11570,0.09875,0.07953,0.1739,0.06149,...,19.52,134.90,1227.0,0.12550,0.2812,0.2489,0.14560,0.2756,0.07919,M


In [5]:
# Save the sampled training and testing sets as ARFF files.
pandas2arff(
    df_training,
    "WBDC-training.arff",
    wekaname="WBDC-training",
    cleanstringdata=True,
    cleannan=True,
)
pandas2arff(
    df_testing,
    "WBDC-testing.arff",
    wekaname="WBDC-testing",
    cleanstringdata=True,
    cleannan=True,
)


True