In [1]:
import glob
import string

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier

# Build Dataset

In [2]:
# Build the raw corpus: one (document text, violence type) pair per .txt file
# found under fp_base/<violence type>/.
text_df = pd.DataFrame()  # empty placeholder; not populated in this cell
labels = pd.DataFrame()   # empty placeholder; not populated in this cell
fp_base = './Data_Categorical/'
violence_types = ['Physical violence', 'Bullying', 'Weapons use', 'Dating violence']
data = []
for vtype in violence_types:
    fp_complete = fp_base + vtype + '/*.txt'
    # glob returns paths in arbitrary, filesystem-dependent order; sort them so
    # that positional slices and cross-validation splits taken from `data`
    # are reproducible from one run (and one machine) to the next.
    files_list = sorted(glob.glob(fp_complete))
    for fname in files_list:
        # errors='ignore' silently drops any bytes that cannot be decoded.
        with open(fname, errors='ignore') as f:
            txt = f.read()
        data.append((txt, vtype))

In [3]:
# Text-normalisation resources shared by clean().
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


def clean(doc):
    """Normalise one document into a lower-cased, lemmatized string.

    Steps, in order: lower-case and whitespace-split the text, drop tokens
    found in `stop`, delete every character found in `exclude`, lemmatize
    each remaining whitespace-separated token, and finally delete every
    numeric character.

    Stop words are filtered before punctuation is removed, so a token is
    compared against `stop` with any attached punctuation still on it.
    """
    kept_tokens = [tok for tok in doc.lower().split() if tok not in stop]
    without_punct = "".join(filter(lambda c: c not in exclude, " ".join(kept_tokens)))
    lemmas = [lemma.lemmatize(tok) for tok in without_punct.split()]
    return "".join(c for c in " ".join(lemmas) if not c.isnumeric())

In [4]:
# Split the (raw text, label) pairs into two parallel lists, cleaning each
# document on the way.
text = [clean(raw_doc) for raw_doc, _ in data]
label = [doc_label for _, doc_label in data]

# Try TF->SVD->SVM

In [None]:
# TF -> SVD -> SVM: binary bigram term-frequency features, reduced to three
# SVD components, then classified with a linear SVM.
vocabs = ['dating', 'bullying', 'violence', 'sexual', 'weapons']  # not used in this cell
# Fixed seed so the hold-out split (and therefore the printed score) is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    text, label, test_size=0.2, random_state=123)
tf_vect = TfidfVectorizer(use_idf=False, binary=True, max_features=100, ngram_range=(2, 2))
x_train_tf = tf_vect.fit_transform(x_train)
x_test_tf = tf_vect.transform(x_test)
# TruncatedSVD uses a randomized solver by default, so it is seeded as well.
svd = TruncatedSVD(n_components=3, random_state=123)
x_train_tf_svd = svd.fit_transform(x_train_tf)
x_test_tf_svd = svd.transform(x_test_tf)
clf = LinearSVC()
# Fit and score on the SVD-reduced features so the pipeline matches the
# TF -> SVD -> SVM heading; fitting on x_train_tf would leave the SVD step unused.
clf.fit(x_train_tf_svd, y_train)
print("******Test Result******")
print("Test Set Size:", len(x_test))
print("Score:", clf.score(x_test_tf_svd, y_test))

# Attempt to use matrix distance

In [6]:
from sklearn.linear_model import LogisticRegression
from scipy.spatial import distance
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Number of 'Dating violence' documents, counted directly from `data` so this
# cell does not rely on a variable that is only defined in a later cell.
sum(1 for _, vtype in data if vtype == 'Dating violence')

In [11]:
dating_violence_texts = [doc for doc, vtype in data if vtype == 'Dating violence']
dating_vectorizer = TfidfVectorizer(binary=True, max_features=100)
# The first 20 documents fit the vocabulary and form the reference model.
model_vector = dating_vectorizer.fit_transform(dating_violence_texts[:20])
# The hold-out slice starts at index 20 so that no document falls in the gap
# between the two slices.
test_vector = dating_vectorizer.transform(dating_violence_texts[20:])
print(type(dating_violence_texts[1]))

<class 'str'>


In [None]:
# Per-feature mean over the rows of the fitted dating-violence matrix.
# np.mean() on a non-ndarray defers to the object's own .mean(), so calling the
# method directly yields the same result.
vector_average = model_vector.mean(axis=0)

In [None]:
# Euclidean distance between the mean model vector and held-out document 6.
# Both operands are single rows, so averaging the cdist output just unwraps
# the one distance it contains.
distance.cdist(vector_average, test_vector[6].toarray(), metric='euclidean').mean()

In [None]:
# Weapons-use documents, vectorized with the vocabulary fitted on dating-violence text.
weapon_text = [doc for doc, vtype in data if vtype == 'Weapons use']
test_weapon_vector = dating_vectorizer.transform(weapon_text)

In [None]:
# Euclidean distance from every weapons-use document to the mean model vector.
# One cdist call over the densified matrix replaces the per-row Python loop
# (whose np.mean was taken over a single value); the (1, n_docs) result is
# flattened to a 1-D array of length n_docs.
weapon_means = distance.cdist(
    vector_average, test_weapon_vector.toarray(), 'euclidean').ravel()

In [None]:
# Histogram of the weapons-use distances, x-axis limited to [0, 1].
plt.hist(weapon_means)
plt.xlim(0, 1)
# Label the figure so it can be told apart from the other class histograms.
plt.title("Weapons use: distance to mean dating-violence vector")
plt.xlabel("Euclidean distance")
plt.ylabel("Number of documents")
# Share of weapons-use documents whose distance is at most 0.25.
print(np.count_nonzero(weapon_means <= 0.25) / len(weapon_means))

In [None]:
# Euclidean distance from every held-out dating-violence document to the mean
# model vector. One cdist call over the densified matrix replaces the per-row
# Python loop (whose np.mean was taken over a single value); the (1, n_docs)
# result is flattened to a 1-D array of length n_docs.
dating_means = distance.cdist(
    vector_average, test_vector.toarray(), 'euclidean').ravel()

In [None]:
# Histogram of the held-out dating-violence distances, x-axis limited to [0, 1].
plt.hist(dating_means)
plt.xlim(0, 1)
# Label the figure so it can be told apart from the other class histograms.
plt.title("Dating violence (held-out): distance to mean dating-violence vector")
plt.xlabel("Euclidean distance")
plt.ylabel("Number of documents")
# Share of held-out dating-violence documents whose distance is at most 0.25.
print(np.count_nonzero(dating_means <= 0.25) / len(dating_means))

In [None]:
# Physical-violence documents, vectorized with the vocabulary fitted on dating-violence text.
phys_text = [doc for doc, vtype in data if vtype == 'Physical violence']
test_phys_vector = dating_vectorizer.transform(phys_text)

In [None]:
# Euclidean distance from every physical-violence document to the mean model
# vector. One cdist call over the densified matrix replaces the per-row Python
# loop (whose np.mean was taken over a single value); the (1, n_docs) result is
# flattened to a 1-D array of length n_docs.
phys_means = distance.cdist(
    vector_average, test_phys_vector.toarray(), 'euclidean').ravel()

In [None]:
# Histogram of the physical-violence distances, x-axis limited to [0, 1].
plt.hist(phys_means)
plt.xlim(0, 1)
# Label the figure so it can be told apart from the other class histograms.
plt.title("Physical violence: distance to mean dating-violence vector")
plt.xlabel("Euclidean distance")
plt.ylabel("Number of documents")
# Share of physical-violence documents whose distance is at most 0.25.
print(np.count_nonzero(phys_means <= 0.25) / len(phys_means))

In [5]:
# Raw (uncleaned) document texts, one list per violence type.
phys_text = [doc for doc, vtype in data if vtype == 'Physical violence']
dating_text = [doc for doc, vtype in data if vtype == 'Dating violence']
bully_text = [doc for doc, vtype in data if vtype == 'Bullying']
weapon_text = [doc for doc, vtype in data if vtype == 'Weapons use']

In [6]:
from vector_distance_classifier import VDC
def _fitted_vdc(texts):
    """Create a new VDC, fit it on `texts`, and return the fitted instance."""
    model = VDC()
    model.fit(texts)
    return model


# One classifier per violence type, each fitted only on that type's documents.
dating_clf = _fitted_vdc(dating_text)
phys_clf = _fitted_vdc(phys_text)
weapons_clf = _fitted_vdc(weapon_text)
bully_clf = _fitted_vdc(bully_text)

In [7]:
from sklearn.model_selection import KFold
from vector_distance_classifier import VDC

In [23]:
def testModel(clf, data, n_splits=4, random_state=123):
    """Cross-validate `clf` on `data`, printing one result per fold.

    For every fold the classifier is refit on the training samples and asked
    to predict the held-out samples. The printed value is the sum of the
    predictions divided by the number of held-out samples.

    Parameters
    ----------
    clf : object exposing fit(samples) and predict(samples)
        Refit in place on every fold.
    data : sequence
        Samples to split across folds.
    n_splits : int, default 4
        Number of folds.
    random_state : int, default 123
        Seed for the fold shuffling.

    Returns
    -------
    None
    """
    # random_state only takes effect together with shuffle=True; recent
    # scikit-learn releases raise ValueError when it is given without it.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # Materialise once and index positionally, instead of rebuilding
    # np.array(data) twice per fold.
    samples = list(data)
    for train_index, test_index in kf.split(samples):
        clf.fit([samples[i] for i in train_index])
        predicted = clf.predict([samples[i] for i in test_index])
        predicted_np = np.array(predicted)
        print("**Result:", sum(predicted_np) / len(test_index))

In [32]:
# Sum of the bullying classifier's predictions on the weapons-use documents,
# divided by the number of those documents.
weapon_predictions = bully_clf.predict(weapon_text)
sum(weapon_predictions) / len(weapon_text)

0.5287356321839081

In [None]:
# Small scratch list holding the integers 1 to 3.
x = list(range(1, 4))

In [None]:
# Inspect the type of the index arrays that KFold.split() yields.
# In the visible notebook `train_index` is only bound inside testModel(), and
# objects have no `__type__` method, so take one split here and use type().
train_index = next(KFold(n_splits=4).split(dating_text))[0]
type(train_index)