# Mining Text Data

We will use the 20 Newsgroups text dataset (https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html).

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk.corpus import names
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import timeit
import nltk
from yellowbrick.classifier import ClassificationReport,ConfusionMatrix

In [2]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# NOTE(review): no part-of-speech is passed here, and the recorded output shows
# "running" coming back unchanged — presumably the word is treated as a noun by
# default; confirm against the NLTK documentation.
print(lemmatizer.lemmatize("running"))  # Output: 'running'
print(lemmatizer.lemmatize("geese"))    # Output: 'goose'


running
goose


In [3]:
# Fetch the NLTK resources used in this notebook: `names` backs the
# nltk.corpus.names import, and `wordnet` is presumably what WordNetLemmatizer
# reads from (confirm against NLTK docs).
# NOTE(review): the lemmatizer demo cell above runs before this download, so a
# fresh environment may need this cell executed first.
nltk.download('names')
nltk.download('wordnet')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\cui10\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cui10\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Whole corpus (subset='all') with headers, footers and quoted replies removed.
dataset = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
texts, target = dataset.data, dataset.target

## Classification

In [5]:
# Load the predefined train and test partitions with the same shuffle seed.
# Unlike the `dataset` fetch above, no `remove=` argument is passed here.
data_train = fetch_20newsgroups(subset='train', random_state=21)
data_test = fetch_20newsgroups(subset='test', random_state=21)
train_label, test_label = data_train.target, data_test.target

# Sanity check: document counts per split and the test label count.
len(data_train.data), len(data_test.data), len(test_label)

(11314, 7532, 7532)

In [6]:
# Distinct class ids that occur in the test labels.
np.unique(test_label)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [None]:
all_names = names.words()
# Set view of the name list. clean() does one membership test per token, and
# scanning the list each time is needlessly slow on a corpus of this size.
_NAME_SET = frozenset(all_names)
WNL = WordNetLemmatizer()


def clean(data):
    """Turn raw documents into space-joined strings of lemmatised tokens.

    Each document is split on whitespace. A token is kept only if it is purely
    alphabetic and does not appear (case-sensitively) in ``all_names``. Kept
    tokens are lower-cased and passed through ``WNL.lemmatize``.

    Parameters
    ----------
    data : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order. A document
        with no surviving tokens yields an empty string.
    """
    cleaned = []
    for document in data:
        kept = [
            WNL.lemmatize(token.lower())
            for token in document.split()
            # The name check runs on the original casing, before lower-casing.
            if token.isalpha() and token not in _NAME_SET
        ]
        cleaned.append(' '.join(kept))
    return cleaned

In [8]:
# Apply the same cleaning to both partitions (train first, then test).
x_train, x_test = (clean(split.data) for split in (data_train, data_test))

In [9]:
# The vectorizer is fitted on the training text only; the test text is
# projected with the already-fitted vectorizer.
tf = TfidfVectorizer(max_features=4000, stop_words='english')
X_train, X_test = tf.fit_transform(x_train), tf.transform(x_test)
Y_train, Y_test = train_label, test_label

X_train.shape, X_test.shape

((11314, 4000), (7532, 4000))

In [10]:
# Display the shape of the training label array.
train_label.shape

(11314,)

In [11]:
# Linear-kernel SVC; used as the base estimator of the grid search over C below.
svc_lib = SVC(kernel = 'linear')

In [12]:
# Candidate values for C, evaluated with 3-fold cross-validation on all cores.
parameters = {'C': (0.5, 1.0, 10, 100)}
grid_search1 = GridSearchCV(
    estimator=svc_lib,
    param_grid=parameters,
    n_jobs=-1,
    cv=3,
)

# Wall-clock timing of the whole search.
start_time = timeit.default_timer()
grid_search1.fit(X_train, train_label)
final = timeit.default_timer() - start_time
print("Execution Time : ", final, 's')

Execution Time :  81.94165370000701 s


In [13]:
# Best C and its mean cross-validated score, one per line.
print(grid_search1.best_params_, grid_search1.best_score_, sep='\n')

{'C': 1.0}
0.8291506763256903


In [14]:
# Take the best estimator found by the grid search and score it on the test
# TF-IDF matrix against the test labels.
grid_search_best1 = grid_search1.best_estimator_
accur1 = grid_search_best1.score(X_test, test_label)
print(accur1)

0.7361922464152948


In [15]:
from sklearn.pipeline import Pipeline

# TF-IDF vectorisation followed by a linear SVM. The step names ('tf_id',
# 'svm_im') are the prefixes used by the grid keys below.
pipeline = Pipeline([
    ('tf_id', TfidfVectorizer(stop_words="english")),
    ('svm_im', LinearSVC()),
])

# Vectorizer settings to search: 4 * 2 * 2 * 2 = 32 combinations.
parameter = {
    'tf_id__max_features': (100, 1000, 2000, 8000),
    'tf_id__max_df': (0.25, 0.5),
    'tf_id__smooth_idf': (True, False),
    'tf_id__sublinear_tf': (True, False),
}

In [16]:
# 32 parameter combinations x 3 folds = 96 pipeline fits, each re-vectorising
# the text. Run them on all cores, as the earlier SVC grid search does, so the
# cell finishes in reasonable time.
grid_search = GridSearchCV(pipeline, parameter, n_jobs=-1, cv=3)
grid_search.fit(x_train, train_label)





KeyboardInterrupt: 

In [None]:
# Best vectorizer settings found by the pipeline grid search. This needs the
# fit in the previous cell to have run to completion.
print(grid_search.best_params_)

In [None]:
# Best score recorded by the pipeline grid search. This needs the pipeline
# grid search to have been fitted.
print(grid_search.best_score_)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix
# Multinomial naive Bayes, fitted on the training data.
clf = MultinomialNB()
clf.fit(X_train, Y_train)

# Predict the classes of the test data and print the classification report.
Y_pred = clf.predict(X_test)
print(classification_report(Y_test, Y_pred))

# Score on the test data.
print("Testing: ", clf.score(X_test, Y_test))

In [None]:
# Confusion matrix of the current classifier: fit on the training data, score
# on the test data.
visualizer = ConfusionMatrix(clf)
visualizer.fit(X_train, Y_train)
visualizer.score(X_test, Y_test)
# show() is the current name of the rendering call; poof() is deprecated.
g = visualizer.show()

In [None]:
# Classification report of the current classifier: fit on the training data,
# score on the test data.
visualizer = ClassificationReport(clf)
visualizer.fit(X_train, Y_train)
visualizer.score(X_test, Y_test)
# show() is the current name of the rendering call; poof() is deprecated.
g = visualizer.show()

# Mining Time Series Data

## The Occupancy Detection Data Set

We use the Occupancy Detection Data Set from the UCI repository (https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+). This dataset is used to build classifiers that detect the presence of occupants in an office room from light, temperature, humidity, and CO2 measurements. Accurate occupancy detection in an office room is an important problem because it can save energy on the order of 30 to 42% [7, 8]. Detecting occupancy without a camera also helps in situations where there are privacy concerns [7].

Download the dataset:

In [None]:
# Download the occupancy archive and unpack it into the working directory.
# The cells below read datatraining.txt, datatest.txt and datatest2.txt,
# presumably extracted from this zip (confirm the archive contents).
# NOTE(review): this relies on `wget` and `unzip` being available on the
# machine running the notebook.
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip
!unzip occupancy_data.zip

Read `datatraining.txt` (together with the two test files) and display its first few rows:

In [None]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from yellowbrick.classifier import ClassificationReport,ConfusionMatrix


# Training frame plus the two held-out test frames.
df = pd.read_csv('datatraining.txt')
df_test1 = pd.read_csv('datatest.txt')
df_test2 = pd.read_csv('datatest2.txt')

# Only the last expression of a cell is displayed, so the preview goes last.
df.head()

In [None]:
# Line plot of the Temperature and Humidity columns of the training frame
# against its index.
df[['Temperature', 'Humidity']].plot(figsize=(12,6))

## Classification

### Data preprocessing

We will use the following columns for classification: date, Temperature, Humidity, Light, CO2, and HumidityRatio. The date column is split into two derived columns, `Time stamp` and `Week Status`. `Time stamp` is the number of seconds elapsed since midnight on the day of the measurement. `Week Status` is either 0 (weekend) or 1 (weekday).

In [None]:
def add_time_features(frame):
    """Add the 'Week Status' and 'Time stamp' columns to ``frame`` in place.

    The ``date`` column is parsed once for the whole frame rather than several
    times per row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, with two columns added or overwritten:
        ``Week Status`` is 1 for Monday to Friday and 0 otherwise;
        ``Time stamp`` is the number of seconds elapsed since midnight.
    """
    stamps = pd.to_datetime(frame['date'])
    # weekday(): Monday == 0 ... Sunday == 6, so values below 5 are weekdays.
    frame['Week Status'] = (stamps.dt.weekday < 5).astype('int64')
    # normalize() resets the time of day to midnight, keeping the date.
    frame['Time stamp'] = (stamps - stamps.dt.normalize()).dt.total_seconds()
    return frame


# The training frame and both test frames need identical derived features.
for frame in (df, df_test1, df_test2):
    add_time_features(frame)

# Preview only; a few rows are enough to check the new columns.
df.head()


In [None]:
# Model inputs, the label column, and display names for label values 0 and 1.
feature_names = [
    'Temperature', 'Humidity', 'Light', 'CO2',
    'HumidityRatio', 'Week Status', 'Time stamp',
]
target_name = 'Occupancy'
classes = ['unoccupied', 'occupied']

# Same column selection for the training frame and both test frames.
X, y = df[feature_names], df[target_name]
X_test1, y_test1 = df_test1[feature_names], df_test1[target_name]
X_test2, y_test2 = df_test2[feature_names], df_test2[target_name]

### Decision Tree Classifier

In [None]:
from sklearn import tree
import pydotplus
from IPython.display import Image


# Depth-3 entropy tree fitted on the full training frame.
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3).fit(X, y)

# Render the tree: DOT source -> pydotplus graph -> inline PNG.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=X.columns,
    class_names=classes,
    filled=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

Model Selection:

In [None]:
import numpy as np
import matplotlib.pyplot as plt

#########################################
# Training and Test set creation
#########################################

from sklearn.model_selection import train_test_split
# Split the occupancy training frame into fitting and validation parts.
# NOTE(review): test_size=0.8 puts 80% of the rows in the validation part and
# leaves 20% for fitting — confirm this ratio is intended.
# NOTE(review): this rebinds X_train / Y_train, which held the TF-IDF matrix
# and newsgroup labels in the text-mining section above.
X_train, X_val, Y_train, Y_val = train_test_split(X, y, test_size=0.8, random_state=1)

from sklearn import tree
from sklearn.metrics import accuracy_score

#########################################
# Model fitting and evaluation
#########################################

maxdepths = [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50]

# One accuracy slot per candidate depth.
trainAcc = np.zeros(len(maxdepths))
testAcc = np.zeros(len(maxdepths))

for index, depth in enumerate(maxdepths):
    clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=123).fit(X_train, Y_train)
    Y_predTrain, Y_predTest = clf.predict(X_train), clf.predict(X_val)
    trainAcc[index] = accuracy_score(Y_train, Y_predTrain)
    testAcc[index] = accuracy_score(Y_val, Y_predTest)

# Accuracy on the fitting split (red circles) and on the held-out split
# (blue triangles) as a function of tree depth.
plt.plot(maxdepths, trainAcc, 'ro-')
plt.plot(maxdepths, testAcc, 'bv--')
plt.legend(['Training Accuracy', 'Test Accuracy'])
plt.xlabel('Max depth')
plt.ylabel('Accuracy')

Testing:

In [None]:
# Depth-5 entropy tree, refitted on the full training frame with a fixed seed.
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    random_state=123,
).fit(X, y)

# Render the tree: DOT source -> pydotplus graph -> inline PNG.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=X.columns,
    class_names=classes,
    filled=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
# Classification report of the already-fitted classifier on the first test set.
visualizer = ClassificationReport(clf, classes=classes)
visualizer.score(X_test1, y_test1)
# show() is the current name of the rendering call; poof() is deprecated.
g = visualizer.show()

In [None]:
# Confusion matrix of the already-fitted classifier on the first test set.
visualizer = ConfusionMatrix(clf)
visualizer.score(X_test1, y_test1)
# show() is the current name of the rendering call; poof() is deprecated.
g = visualizer.show()

In [None]:
# Classification report of the already-fitted classifier on the second test set.
visualizer = ClassificationReport(clf, classes=classes)
visualizer.score(X_test2, y_test2)
# show() is the current name of the rendering call; poof() is deprecated.
g = visualizer.show()

In [None]:
# Confusion matrix of the already-fitted classifier on the second test set.
visualizer = ConfusionMatrix(clf)
visualizer.score(X_test2, y_test2)
# show() is the current name of the rendering call; poof() is deprecated.
g = visualizer.show()

### kNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
%matplotlib inline

numNeighbors = [1, 5, 10, 15, 20, 25, 30]
trainAcc, testAcc = [], []

for k in numNeighbors:
    # Minkowski distance with p=2 is the Euclidean distance.
    clf = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
    clf.fit(X_train, Y_train)
    Y_predTrain, Y_predTest = clf.predict(X_train), clf.predict(X_val)
    trainAcc.append(accuracy_score(Y_train, Y_predTrain))
    testAcc.append(accuracy_score(Y_val, Y_predTest))

# Accuracy on the fitting split and on the held-out split, per value of k.
plt.plot(numNeighbors, trainAcc, 'ro-')
plt.plot(numNeighbors, testAcc, 'bv--')
plt.legend(['Training Accuracy', 'Test Accuracy'])
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')

### SVM

In [None]:
from sklearn.svm import SVC

# Regularisation strengths to try with the linear kernel.
C = [0.01, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 20, 50]
SVMtrainAcc, SVMtestAcc = [], []

for param in C:
    clf = SVC(C=param, kernel='linear').fit(X_train, Y_train)
    Y_predTrain, Y_predTest = clf.predict(X_train), clf.predict(X_val)
    SVMtrainAcc.append(accuracy_score(Y_train, Y_predTrain))
    SVMtestAcc.append(accuracy_score(Y_val, Y_predTest))

# C spans several orders of magnitude, hence the logarithmic x axis.
plt.plot(C, SVMtrainAcc, 'ro-')
plt.plot(C, SVMtestAcc, 'bv--')
plt.legend(['Training Accuracy', 'Test Accuracy'])
plt.xlabel('C')
plt.xscale('log')
plt.ylabel('Accuracy')

### Nonlinear Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Same C grid as the linear sweep, now with an RBF kernel and gamma='auto'.
C = [0.01, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 20, 50]
SVMtrainAcc, SVMtestAcc = [], []

for param in C:
    clf = SVC(C=param, kernel='rbf', gamma='auto').fit(X_train, Y_train)
    Y_predTrain, Y_predTest = clf.predict(X_train), clf.predict(X_val)
    SVMtrainAcc.append(accuracy_score(Y_train, Y_predTrain))
    SVMtestAcc.append(accuracy_score(Y_val, Y_predTest))

# C spans several orders of magnitude, hence the logarithmic x axis.
plt.plot(C, SVMtrainAcc, 'ro-')
plt.plot(C, SVMtestAcc, 'bv--')
plt.legend(['Training Accuracy', 'Test Accuracy'])
plt.xlabel('C')
plt.xscale('log')
plt.ylabel('Accuracy')

### Ensemble methods

In [None]:
from sklearn import ensemble
from sklearn.tree import DecisionTreeClassifier

numBaseClassifiers = 500
maxdepth = 10  # depth cap for the base trees of Bagging and AdaBoost only

# All three ensembles are randomised; a fixed seed makes the comparison
# repeatable across runs.
ensembles = [
    ('Random Forest',
     ensemble.RandomForestClassifier(n_estimators=numBaseClassifiers,
                                     random_state=123)),
    ('Bagging',
     ensemble.BaggingClassifier(DecisionTreeClassifier(max_depth=maxdepth),
                                n_estimators=numBaseClassifiers,
                                random_state=123)),
    ('AdaBoost',
     ensemble.AdaBoostClassifier(DecisionTreeClassifier(max_depth=maxdepth),
                                 n_estimators=numBaseClassifiers,
                                 random_state=123)),
]
methods = [label for label, _ in ensembles]

# Fit each ensemble on the fitting split; score it there and on the held-out split.
trainAcc = []
testAcc = []
for _, clf in ensembles:
    clf.fit(X_train, Y_train)
    Y_predTrain = clf.predict(X_train)
    Y_predTest = clf.predict(X_val)
    trainAcc.append(accuracy_score(Y_train, Y_predTrain))
    testAcc.append(accuracy_score(Y_val, Y_predTest))

# Side-by-side bar charts: left = fitting split, right = held-out split.
positions = [1.5, 2.5, 3.5]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,6))
for ax, scores, title in ((ax1, trainAcc, 'Training accuracy'),
                          (ax2, testAcc, 'Validation accuracy')):
    ax.bar(positions, scores)
    ax.set_xticks(positions)
    ax.set_xticklabels(methods)
    ax.set_title(title)
    ax.set_ylabel('Accuracy')

# References
1. [Data Mining - Charu Aggarwal](http://www.charuaggarwal.net/Data-Mining.htm) (Chapter 13, 14)
3. [Text Classification : 20 Newsgroup](https://github.com/topics/20newsgroup)
4. [Time Series : Anomaly Detection](https://towardsdatascience.com/time-series-of-price-anomaly-detection-13586cd5ff46)
5. [Github : NLP With Python](https://github.com/susanli2016)
6. [Kaggle : Occupancy Dataset](https://www.kaggle.com/robmarkcole/occupancy-detection-data-set-uci/notebooks)
7. Candanedo, L. M., & Feldheim, V. (2016). Accurate occupancy detection of an office room from light, temperature, humidity and CO2 measurements using statistical learning models. Energy and Buildings, 112, 28-39.
8. Erickson, V. L., Carreira-Perpiñán, M. Á., & Cerpa, A. E. (2014). Occupancy modeling and prediction for building energy management. ACM Transactions on Sensor Networks (TOSN), 10(3), 1-28.
9. https://www.kaggle.com/code/neerajmohan/fine-tuning-bert-for-text-classification
10. https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4
11. https://radimrehurek.com/gensim/models/doc2vec.html