# Text classification

In [1]:
import sklearn
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# One seed shared by every stochastic step in this cell (split, weight
# initialisation / mini-batch order, and the displayed sample) so that a
# fresh "Restart & Run All" reproduces the same numbers.
RANDOM_SEED = 42

# Download (or read from the local scikit-learn cache) the newsgroup posts.
newsgroups_data = fetch_20newsgroups()

# Turn each raw post into a sparse TF-IDF vector, ignoring English stop words.
# NOTE(review): the vectorizer is fitted on the whole corpus *before* the
# split below, so its IDF statistics have seen the test documents. Consider
# fitting on the training documents only if an unbiased estimate matters.
tfidf_vect = TfidfVectorizer(stop_words='english')
newsgroups_data_transformed = tfidf_vect.fit_transform(newsgroups_data.data)

# Hold out 20% of the documents for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    newsgroups_data_transformed,
    newsgroups_data.target,
    shuffle=True,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Small MLP: a single hidden layer of 32 ReLU units trained with Adam.
# verbose=True prints the loss after every iteration.
mlp_clf = MLPClassifier(
    activation='relu',
    hidden_layer_sizes=(32,),
    solver='adam',
    verbose=True,
    max_iter=50,
    random_state=RANDOM_SEED,
)
mlp_clf.fit(x_train, y_train)

# Side-by-side table of true vs. predicted class ids for the held-out set.
y_pred = mlp_clf.predict(x_test)
pred_results = pd.DataFrame({'y_test': y_test,
                             'y_pred': y_pred})

# Show a reproducible random handful of rows as the cell output.
pred_results.sample(10, random_state=RANDOM_SEED)

Iteration 1, loss = 2.91613671
Iteration 2, loss = 2.54239889
Iteration 3, loss = 2.02274779
Iteration 4, loss = 1.46622081
Iteration 5, loss = 0.99836980
Iteration 6, loss = 0.67093322
Iteration 7, loss = 0.46111104
Iteration 8, loss = 0.32804198
Iteration 9, loss = 0.24176157
Iteration 10, loss = 0.18375981
Iteration 11, loss = 0.14329278
Iteration 12, loss = 0.11442269
Iteration 13, loss = 0.09338132
Iteration 14, loss = 0.07759173
Iteration 15, loss = 0.06555456
Iteration 16, loss = 0.05616589
Iteration 17, loss = 0.04889062
Iteration 18, loss = 0.04295606
Iteration 19, loss = 0.03821200
Iteration 20, loss = 0.03428952
Iteration 21, loss = 0.03105914
Iteration 22, loss = 0.02828196
Iteration 23, loss = 0.02597945
Iteration 24, loss = 0.02403663
Iteration 25, loss = 0.02232081
Iteration 26, loss = 0.02083203
Iteration 27, loss = 0.01958676
Iteration 28, loss = 0.01848721
Iteration 29, loss = 0.01750446
Iteration 30, loss = 0.01657435
Iteration 31, loss = 0.01579920
Iteration 32, los



Unnamed: 0,y_test,y_pred
1064,7,7
34,0,0
1523,17,17
89,19,19
1059,5,2
1746,2,2
1272,4,1
171,17,17
1406,15,15
709,3,6


In [2]:
# Contingency table of the predictions: one row per true class id (y_test),
# one column per predicted class id (y_pred); the diagonal holds the hits.
newsgroups_data_crosstab = pd.crosstab(index=pred_results['y_test'],
                                       columns=pred_results['y_pred'])
newsgroups_data_crosstab

y_pred,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,1
1,0,110,2,2,2,5,1,0,0,0,0,0,1,1,0,0,0,0,0,0
2,0,1,93,6,2,2,1,0,0,1,0,0,1,0,1,0,0,0,0,0
3,0,4,2,99,7,0,5,1,0,0,0,0,1,0,0,0,0,0,0,0
4,0,2,0,5,88,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,4,5,3,0,100,1,0,0,0,0,2,1,0,0,0,0,0,0,0
6,0,0,1,1,1,0,103,0,0,1,1,0,2,0,1,0,0,0,0,0
7,0,0,0,0,2,0,4,119,1,2,0,0,2,0,1,0,1,0,0,0
8,0,1,0,0,0,0,2,2,125,0,0,0,0,0,0,0,0,0,0,0
9,0,0,1,1,0,0,0,1,0,108,0,0,0,1,0,0,0,0,0,0


In [3]:
from sklearn.metrics import accuracy_score

# Share of held-out documents whose predicted class id equals the true one.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9209014582412727

# Image classification

In [4]:
# !pip install opencv-python
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os

# Root folder of the image training set, relative to the current working
# directory. Intended as the `path` argument of load_images() (the only call
# in this notebook is currently commented out), which treats each immediate
# sub-folder as one class and uses the folder name as the label.
DATASET_PATH = 'datasets/Lego/train/'

In [5]:
def load_images(path):
    """Load every readable image under ``path`` together with its class label.

    ``path`` is expected to contain one sub-directory per class. The name of
    the sub-directory is used verbatim as the label of every image found
    directly inside it. Entries of ``path`` that are not directories, and
    directories nested inside a class folder, are ignored.

    Parameters
    ----------
    path : str
        Root directory to scan.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The decoded images (as returned by ``cv2.imread``) and the matching
        labels, in the same order. Both arrays are empty when nothing could
        be loaded.

    Notes
    -----
    * Folders and files are visited in sorted name order, so the result does
      not depend on the platform-specific order of ``os.listdir``.
    * Files that ``cv2.imread`` cannot decode are reported and skipped
      instead of being stored as ``None``.
    * Stacking with ``np.array`` presumably relies on all images sharing the
      same height, width and channel count -- TODO confirm for this dataset.
    """
    images_train = []
    labels_train = []

    for sub_folder in sorted(os.listdir(path)):
        fullpath = os.path.join(path, sub_folder)

        # Only directories represent classes; skip stray files at the root.
        if not os.path.isdir(fullpath):
            continue

        # Progress output: one line per class folder actually scanned.
        print(fullpath)

        for image_filename in sorted(os.listdir(fullpath)):
            image_fullpath = os.path.join(fullpath, image_filename)

            # Nested directories are not descended into.
            if os.path.isdir(image_fullpath):
                continue

            img = cv2.imread(image_fullpath)

            # imread signals failure by returning None rather than raising;
            # keeping such entries would corrupt the stacked array.
            if img is None:
                print('Skipping unreadable file:', image_fullpath)
                continue

            images_train.append(img)
            labels_train.append(sub_folder)

    return np.array(images_train), np.array(labels_train)

In [7]:
# images_train, labels_train = load_images(DATASET_PATH)

# from sklearn.preprocessing import LabelEncoder
# encoder = LabelEncoder()
# Y = encoder.fit_transform(labels_train)

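# from sklearn.model_selection import train_test_split
# # NOTE: `X` was never defined in this draft. MLPClassifier expects a 2-D
# # (n_samples, n_features) input, so flatten each image into one row first
# # (assumes all images share the same shape -- TODO confirm).
# X = images_train.reshape(len(images_train), -1)
# x_train, x_test, y_train, y_test = train_test_split(X, Y, shuffle=True, test_size = 0.2)
# x_train.shape, y_train.shape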

# from sklearn.neural_network import MLPClassifier
# clf_image = MLPClassifier(activation = 'relu',
#                          hidden_layer_sizes = (100, 100, 100), 
#                          solver='adam', 
#                          verbose=True,
#                          max_iter=100)
# clf_image.fit(x_train, y_train)
# y_pred = clf_image.predict(x_test)

# from sklearn.metrics import accuracy_score
# accuracy_score(y_test, y_pred)