-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
74 lines (58 loc) · 2.81 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import DataLoader
from dataset import WrapperDataset
from constants import *
from training import train, evaluate
from models import Network
# Script entry point: load the news dataset, vectorize the article text with
# TF-IDF, train the network and report its accuracy on the held-out test split.
if __name__ == '__main__':
    # Load the data. Forward slashes resolve on Windows as well as POSIX,
    # whereas the previous backslash-separated path only worked on Windows.
    df = pd.read_csv('data/news.csv')
    print('Shape of the data: {}'.format(df.shape))

    # Drop the 'Unnamed: 0' column (presumably a row index saved with the CSV
    # -- confirm against the data file).
    df = df.drop('Unnamed: 0', axis=1)

    # Extract the label
    labels = df.label

    # Hold out 20% of the samples for testing, then 20% of the remainder for
    # validation (64% train / 16% validation / 20% test overall). The fixed
    # random_state keeps both splits reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        df['text'], labels, test_size=0.2, random_state=1)
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.2, random_state=1)

    # Plot the distribution of the classes in the training split
    plt.figure(figsize=(12, 5))
    plt.hist(y_train)
    plt.show()

    # Initialize a TF-IDF vectorizer. English stop words are removed, and
    # max_df is a *document*-frequency cutoff: terms that occur in more
    # documents than MAX_TERM_FREQUENCY allows are discarded.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=MAX_TERM_FREQUENCY)

    # Fit on the training split only, so no vocabulary or IDF statistics leak
    # from validation/test; those splits are merely transformed.
    tfidf_train = tfidf_vectorizer.fit_transform(x_train)
    tfidf_val = tfidf_vectorizer.transform(x_val)
    tfidf_test = tfidf_vectorizer.transform(x_test)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older releases.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        vocabulary = tfidf_vectorizer.get_feature_names_out()
    else:
        vocabulary = tfidf_vectorizer.get_feature_names()

    # One row per vocabulary term with its learned IDF weight
    df_idf = pd.DataFrame({'word': vocabulary, 'idf_weights': tfidf_vectorizer.idf_})

    # Ascending sort + head(10) selects the ten terms with the LOWEST IDF
    # weight, i.e. the terms spread across the most training documents.
    sorted_idf = df_idf.sort_values(by='idf_weights')
    sorted_idf_top_10 = sorted_idf.head(10)
    plt.figure(figsize=(12, 5))
    plt.hist(sorted_idf_top_10.word, weights=sorted_idf_top_10.idf_weights)
    # plt.show()

    # Generate the datasets and the loaders. toarray() densifies the sparse
    # TF-IDF matrices before they are handed to WrapperDataset.
    train_dataset = WrapperDataset(tfidf_train.toarray(), y_train)
    valid_dataset = WrapperDataset(tfidf_val.toarray(), y_val)
    test_dataset = WrapperDataset(tfidf_test.toarray(), y_test)

    # Only the training loader is shuffled; evaluation gains nothing from
    # shuffling, and a fixed order keeps validation/test batches reproducible.
    train_dl = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    valid_dl = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
    test_dl = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Init the network: the input width equals the TF-IDF vocabulary size
    net = Network(input=tfidf_train.shape[1], layers=3, hidden=1000, output=2)

    # Training
    net = train(net, train_dl, valid_dl, epochs=500, early_stopping=False)

    # Test: the result of evaluate() is unpacked as (accuracy, loss)
    test_accuracy, test_loss = evaluate(net, test_dl, torch.nn.CrossEntropyLoss())
    print('Final test accuracy: {:.2f}'.format(test_accuracy))