In [1]:
import pandas as pd
# Training data file; fields in this file are separated by ';'.
input_file = "datatrain.csv"

# pandas assumes ',' as the separator, so ';' has to be passed explicitly.
# header=0 takes the column names from the first row of the file.
# (Other layouts: pass sep=" " for space-delimited or sep="\t" for tab-delimited.)
df = pd.read_csv(input_file, sep=';', header=0)

# Names of every column exactly as read from the file.
original_headers = list(df.columns.values)

# Keep only the numeric columns. This uses the public select_dtypes API instead
# of the private DataFrame._get_numeric_data helper; 'bool' is listed as well
# because the private helper also treated boolean columns as numeric.
df1 = df.select_dtypes(include=['number', 'bool'])

# Names of the numeric columns, in the same order as the columns of numpy_array.
numeric_headers = list(df1.columns.values)

# Plain NumPy matrix of the numeric values, used as the scikit-learn input.
# NOTE(review): in the displayed output the last numeric column looks like a
# running row id (1, 2, 3, ...); if so it probably should be dropped before
# modelling -- confirm against the data.
numpy_array = df1.values
numpy_array

array([[2004,  157,   30, ...,    3,   71,    1],
       [2000,  135,   79, ...,    7,   39,    2],
       [2007,  173,   96, ...,    4,   76,    3],
       ...,
       [1959,  168,   80, ...,    7,   74, 1393],
       [1959,  174,   26, ...,    4,   65, 1394],
       [1959,  133,   50, ...,    3,   56, 1395]], dtype=int64)

In [10]:
# Target labels: the 'Genre' column, kept as a single-column 2-D array
# (selecting with a list of names yields a frame, hence one column per row).
Y = df[['Genre']].values
Y

array([['ALTERNATIVOS'],
       ['ROCK'],
       ['METAL'],
       ...,
       ['ROCK'],
       ['JAZZ FUNK'],
       ['ALTERNATIVOS']], dtype=object)

In [6]:
# Non-numeric part of the data: every column stored with the object dtype.
df2 = df.select_dtypes(include='object')
df2

Unnamed: 0,Title,Artist,Genre
0,Sunrise,Norah Jones,ALTERNATIVOS
1,Black Night,Deep Purple,ROCK
2,The Pretender,Foo Fighters,METAL
3,Waitin' On A Sunny Day,Bruce Springsteen,ROCK
4,The Road Ahead (Miles Of The Unknown),City To City,ROCK
...,...,...,...
1390,Summertime,Louis Armstrong,ALTERNATIVOS
1391,Heartbreak Hotel,Elvis Presley,ALTERNATIVOS
1392,Johnny B. Goode,Chuck Berry,ROCK
1393,Take Five,The Dave Brubeck Quartet,JAZZ FUNK


In [11]:
from sklearn import svm,datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [12]:
# Fixed seed for every estimator that uses randomness, so that the grid-search
# scores are the same on each Restart & Run All.
RANDOM_STATE = 42

# Candidate classifiers for the grid search.
# Each entry maps a display name to:
#   'model'  -- an unfitted estimator instance
#   'params' -- the hyper-parameter grid to search ({} means "defaults only")
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # Bootstrap sampling and feature selection are random -> seed them.
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        # liblinear shuffles the data internally -> seed it.
        # NOTE(review): `multi_class` is deprecated in recent scikit-learn
        # releases -- confirm against the installed version.
        'model': LogisticRegression(
            solver='liblinear',
            multi_class='auto',
            random_state=RANDOM_STATE,
        ),
        'params': {
            'C': [1, 5, 10]
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        # Ties between equally good splits are broken randomly -> seed it.
        'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'params': {
            'criterion': ['gini', 'entropy'],
        }
    }
}

In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Y is a single-column 2-D array, while scikit-learn classifiers expect a
# 1-D label vector; flatten it once up front.
labels = Y.ravel()

# One summary record per candidate model: its name, the best mean
# cross-validated score and the parameter combination that achieved it.
scores = []

for model_name, mp in model_params.items():
    # Exhaustive search over this model's grid with 5-fold cross-validation.
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    # Features are the numeric matrix built in the loading cell (`numpy_array`);
    # the previous `numpy.array` referred to an undefined name.
    clf.fit(numpy_array, labels)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

In [None]:
# Tabulate the grid-search results, one row per model.
# NOTE(review): this rebinds `df`, which held the raw data loaded from the CSV;
# cells that read the raw frame cannot be re-run after this one.
df = pd.DataFrame(
    data=scores,
    columns=['model', 'best_score', 'best_params'],
)
df