In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [2]:
df_train = pd.read_csv('train.csv').dropna()
display(df_train.head())
df_test = pd.read_csv('test.csv').dropna()
df_test.head()

Unnamed: 0,Id,Comment,Topic
0,0x840,A few things. You might have negative- frequen...,Biology
1,0xbf0,Is it so hard to believe that there exist part...,Physics
2,0x1dfc,There are bees,Biology
3,0xc7e,I'm a medication technician. And that's alot o...,Biology
4,0xbba,Cesium is such a pretty metal.,Chemistry


Unnamed: 0,Id,Comment,Topic
0,0x1aa9,Personally I have no idea what my IQ is. I’ve ...,Biology
1,0x25e,I'm skeptical. A heavier lid would be needed t...,Physics
2,0x1248,I think I have 100 cm of books on the subject....,Biology
3,0x2b9,Is chemistry hard in uni. Ive read somewhere t...,Chemistry
4,0x24af,"In addition to the other comment, you can crit...",Physics


In [3]:
comments_train = df_train['Comment'].to_numpy()
print(comments_train[:2])
topics_train = df_train['Topic'].to_numpy()
topics_train[:2]
del df_train

['A few things. You might have negative- frequency dependent selection going on where the least common phenotype, reflected by genotype, is going to have an advantage in the environment. For instance, if a prey animal such as a vole were to have a light and a dark phenotype, a predator might recognize the more common phenotype as food.  So if the light voles are more common, foxes may be keeping a closer eye out for light phenotypic voles, recognising them as good prey. This would reduce the light causing alleles due to increased predation and the dark genotypes would increase their proportion of the population until this scenario is reversed. This cycle continues perpetually. \\n\\nHowever, this is unlikely to be strictly yearly as it usually takes more time than a year for an entire populations allele frequencies to change enough to make a large enough difference to alter fitness. \\n\\nMore likely on a *year to year* basis, the population is experiencing fluctuating selection where 

In [4]:
comments_test = df_test['Comment'].to_numpy()
print(comments_test[:2])
topics_test = df_test['Topic'].to_numpy()
topics_test[:2]
del df_test

['Personally I have no idea what my IQ is. I’ve never been tested. However, the test is an outdated, inaccurate, inappropriate measuring tool that has been largely abandoned by actual science. Only Mensa cares and their members tend to be insufferable misogynistic and racist assholes. So. Ya know. Go off I guess?'
 "I'm skeptical. A heavier lid would be needed to build pressure, while a lighter lid is needed to move a lot with the release of pressure. I feel like I am missing something here."]


In [5]:
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

from nltk.tokenize import word_tokenize

def trim_data(comments):
  trimmed_comments = []
  for comment in comments:
    comment = comment.lower()
    tokens = word_tokenize(comment)
    comment = [word for word in tokens if not word in set(stopwords)]
    comment = ' '.join(comment)
    trimmed_comments.append(comment)
  return trimmed_comments

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
trimmed_comments_train = trim_data(comments_train)
trimmed_comments_train = np.array(trimmed_comments_train)
print(trimmed_comments_train[:2])

['things . might negative- frequency dependent selection going least common phenotype , reflected genotype , going advantage environment . instance , prey animal vole light dark phenotype , predator might recognize common phenotype food . light voles common , foxes may keeping closer eye light phenotypic voles , recognising good prey . would reduce light causing alleles due increased predation dark genotypes would increase proportion population scenario reversed . cycle continues perpetually . \\n\\nhowever , unlikely strictly yearly usually takes time year entire populations allele frequencies change enough make large enough difference alter fitness . \\n\\nmore likely * year year * basis , population experiencing fluctuating selection alternating conditions environment favor one genotype another . perhaps plant species living area flooded every year two phenotypes population plants much better dryer year one better wet year . flooding , dry-type genotype fitness leading offspring the

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings(action = 'ignore')


comments_expded = ''
for i in range(len(trimmed_comments_train)):
  replacement = ' ' + topics_train[i] + ' '
  comments_expded += trimmed_comments_train[i].replace(' ', replacement)
print(len(comments_expded))

data = []
for sent in sent_tokenize(comments_expded):
    temp = []
     
    for word in word_tokenize(sent):
        temp.append(word)
    # temp = np.array(temp)
    data.append(temp)

# data = np.array(data)
# print(data)
word2vec = Word2Vec(data)

2482306


In [8]:
print(word2vec.wv.similarity('radiation', 'Chemistry'))
print(word2vec.wv.similarity('radiation', 'Biology'))
print(word2vec.wv.similarity('radiation', 'Physics'))

0.17754352
0.2700464
0.40304804


In [9]:
trimmed_comments_test = trim_data(comments_test)
trimmed_comments_test = np.array(trimmed_comments_test)
print(trimmed_comments_test[:2])

['personally idea iq . ’ never tested . however , test outdated , inaccurate , inappropriate measuring tool largely abandoned actual science . mensa cares members tend insufferable misogynistic racist assholes . . ya know . go guess ?'
 "'m skeptical . heavier lid would needed build pressure , lighter lid needed move lot release pressure . feel like missing something ."]


In [10]:
tfidf = TfidfVectorizer()
response = tfidf.fit_transform(trimmed_comments_test)
feature_names = tfidf.get_feature_names()
# for col in response.nonzero()[1]:
    # print (feature_names[col], ' - ', response[0, col])
# print("Feature Names n",feature_names)
print('Input len', len(trimmed_comments_test))
print('Sparse Matrix len', response.shape)

Input len 1586
Sparse Matrix len (1586, 11775)


In [11]:
response = response.toarray()

In [12]:
test_comments = np.zeros((len(trimmed_comments_test), 3))

for comment_indx, comment in enumerate(trimmed_comments_test):
  for word in word_tokenize(comment):
    sim = ['', '', '']
    try:
      index = feature_names.index(word)
      word_tfidf = response[comment_indx, index]
      test_comments[comment_indx][0] += word2vec.wv.similarity(word, 'Chemistry')
      test_comments[comment_indx][1] += word2vec.wv.similarity(word, 'Biology')
      test_comments[comment_indx][2] += word2vec.wv.similarity(word, 'Physics')
      test_comments[comment_indx] = test_comments[comment_indx] * word_tfidf
    except:
      pass
print(test_comments)

[[0.03368689 0.05847032 0.04007046]
 [0.05698991 0.04642279 0.02282803]
 [0.02006182 0.02081264 0.02118429]
 ...
 [0.06068418 0.05273878 0.03974868]
 [0.03189857 0.05305875 0.08028819]
 [0.11389891 0.11356571 0.23127997]]


In [13]:
from scipy.special import comb
def count_of_elements(List):
    temp = {}
    for item in List:
        if item in temp:
            temp[item] += 1
        else:
            temp[item] = 1
    return temp

def calc_elements_in_each_cluster(element, label, predict):
    temp = {}
    for i in range(len(label)):
        if label[i] == element:
            if predict[i] in temp:
                temp[predict[i]] += 1
            else:
                temp[predict[i]] = 1
    return temp

def randIndex(label, predict):
    if len(label) != len(predict):
        return -1
    clusters = {}
    N = len(label)
        

    for i in range(N):
        if predict[i] not in clusters:
            clusters[predict[i]] = []
        
        clusters[predict[i]].append(label[i])

    tp_fp = 0
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for key, value in clusters.items():
        tp_fp += comb(len(value), 2)
        for key1, value1 in count_of_elements(value).items():
            tp += comb(value1, 2)
        fp = tp_fp - tp

    checked = []

    for element in label:
        if element not in checked:
            fn_temp = comb(10, 2)
            for key, value in calc_elements_in_each_cluster(element, label, predict).items():
                fn_temp -= comb(value, 2)

            fn += fn_temp

            checked.append(element)


    tn = comb(N, 2) - (fn + tp + fp)
    fn = (tp + tn) / comb(N, 2)

    ri = (tp + tn) / comb(N, 2)
    
    return ri

In [14]:
results = []
for item in test_comments:
  label = np.argmax(item)
  if label == 0:
    results.append('Chemistry')
  elif label == 1:
    results.append('Biology')
  elif label == 2:
    results.append('Physics')
    
results = np.array(results)

acc = randIndex(topics_test, results)
print(acc)

0.9277137094688939
