-
Notifications
You must be signed in to change notification settings - Fork 0
/
PCA_all_data.py
61 lines (46 loc) · 1.93 KB
/
PCA_all_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
# from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
# Script overview: TF-IDF vectorize every question in the training set, reduce
# the vectors with truncated SVD, plot the SVD diagnostics, and split the
# reduced matrix back into one matrix per question column.

# Load the question-pair training data.
train_data = pd.read_csv("./Data/train.csv")

# Separate the descriptive/feature columns from the duplicate label.
x_train = train_data[["id", "qid1", "qid2", "question1", "question2"]]
y_train = train_data["is_duplicate"]

# Pull each question column out as a plain list of strings for vectorization.
# Missing questions are replaced with "" first: a bare astype(str) would turn
# NaN into the literal text "nan", which the vectorizer would treat as a word.
train_question1s = train_data["question1"].fillna("").astype(str).tolist()
train_question2s = train_data["question2"].fillna("").astype(str).tolist()

# Stack question1 on top of question2 so both columns share one vocabulary and
# one projection: the first half of the rows is question1, the second half is
# question2.
train_questions = train_question1s + train_question2s

# Declare the vectorizer.
# NOTE(review): 257 features is presumably chosen to leave one more feature
# than the 256 SVD components requested below -- confirm.
vectorizer = TfidfVectorizer(max_features=257)

# Fit the vocabulary and transform all questions into a sparse TF-IDF matrix.
train_tfidfs = vectorizer.fit_transform(train_questions)
print(train_tfidfs.shape)

# Declare the "PCA".
# NOTE: We can't use PCA because our matrix is very sparse, so truncated SVD
# is used instead.
pca = TruncatedSVD(n_components=256)

# Fit and transform the data. TruncatedSVD is unsupervised and ignores any
# target argument, so the labels are deliberately not passed in.
reduced_tfidfs = pca.fit_transform(train_tfidfs)

# Take some statistics from the decomposition (one value per component).
explained_variance = pca.explained_variance_ratio_
singular_values = pca.singular_values_

# savefig does not create missing directories; make sure the output folder
# exists so the run does not fail after the expensive SVD step.
os.makedirs("./Figures", exist_ok=True)

# Plot the explained variance ratio of each component.
plt.figure()
explained_variance_plt = plt.plot(explained_variance)
plt.title('Variance Explained by Number of Components')
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance')
plt.savefig("./Figures/ExplainedVariance.png", dpi=700)

# Plot the singular value of each component.
plt.figure()
singular_values_plt = plt.plot(singular_values)
plt.title('Singular Values by Number of Components')
plt.xlabel('Number of Components')
plt.ylabel('Singular Values')
plt.savefig("./Figures/SingularValues.png", dpi=700)

# Display both figures.
plt.show()

# Split the questions back into separate matrices. The stacked matrix holds
# the question1 rows followed by the same number of question2 rows, so cutting
# it into two equal halves recovers one matrix per question column.
[train_question1_tfidfs, train_question2_tfidfs] = np.split(reduced_tfidfs, 2)