# Playing around with Word Embeddings


## 1. Count Vectorized with aspect weights

In [None]:
def apply_aspdep_weight(train_df, weight):
    train_text = train_df[' text'].values.astype('U')
    train_aspdep = train_df['asp_dep_words'].values.astype('U')
    text_count_vect = CountVectorizer()
    x_text_counts = text_count_vect.fit_transform(train_text)
    text_voc = text_count_vect.vocabulary_
    asp_dep_vect = CountVectorizer(vocabulary=text_voc)
    x_aspdep_counts = asp_dep_vect.fit_transform(train_aspdep)
    x_count_vec = x_text_counts + weight * x_aspdep_counts
    x_tfidf_vec = TfidfTransformer(use_idf=True).fit_transform(x_count_vec)
    return x_tfidf_vec

## 2. Count Vectorized with aspect weight distribution 

In [40]:
def extract_aspect_related_words(sdp, ardf):
    print("Extracting aspect related words from text...")
    cols = list(ardf)
    cols.append('asp_dep_words')
    ar_df = pandas.DataFrame(columns=cols)
    count = 0
    for index, row in ardf.iterrows():
        count += 1
        print(count)
        dep_set = set()
        result = list(sdp.raw_parse(row[' text']))
        parse_triples_list = [item for item in result[0].triples()]
        for governor, dep, dependent in parse_triples_list:
            print("G: ", governor, "DEP: ", dep, "depndent: ",dependent)
            if governor[0] in row[' aspect_term'] or dependent[0] in row[' aspect_term']:
                dep_set.add(governor[0])
                dep_set.add(dependent[0])
        ar_row = [row[c] for c in cols[:-1]]
        ar_row.append(' '.join(list(dep_set)))
        ar_df.loc[len(ar_df.index)] = ar_row
    return ar_df

In [46]:
##TEST CELL
from nltk.parse.stanford import StanfordDependencyParser
import pandas
import nltk
nltk.internals.config_java("C:\Program Files\Java\jdk1.8.0_171\\bin\java.exe")
sdp = StanfordDependencyParser(
    path_to_jar="stanford-nlp-jars\stanford-corenlp-full-2018-01-31\stanford-corenlp-3.9.0.jar",
    path_to_models_jar="stanford-nlp-jars\stanford-corenlp-full-2018-01-31\stanford-corenlp-3.9.0-models.jar")
test_df = pandas.read_csv('test_1.csv', sep='\t')
print(extract_aspect_related_words(sdp, test_df[:5]))


Extracting aspect related words from text...
1
G:  ('features', 'VBZ') DEP:  advmod depndent:  ('Obviously', 'RB')
G:  ('features', 'VBZ') DEP:  nsubj depndent:  ('one', 'PRP')
G:  ('features', 'VBZ') DEP:  advmod depndent:  ('important', 'JJ')
G:  ('features', 'VBZ') DEP:  dobj depndent:  ('interface', 'NN')
G:  ('interface', 'NN') DEP:  compound depndent:  ('computer', 'NN')
G:  ('interface', 'NN') DEP:  amod depndent:  ('human', 'JJ')
2
G:  ('browsing', 'VBD') DEP:  nsubj depndent:  ('web', 'NN')
G:  ('web', 'NN') DEP:  amod depndent:  ('Good', 'JJ')
G:  ('Good', 'JJ') DEP:  nmod:tmod depndent:  ('day', 'NN')
G:  ('day', 'NN') DEP:  det depndent:  ('every', 'DT')
G:  ('web', 'NN') DEP:  amod depndent:  ('computing', 'VBG')
3
G:  ('makes', 'VBZ') DEP:  nsubj depndent:  ('alright', 'NN')
G:  ('alright', 'NN') DEP:  compound depndent:  ('keyboard', 'NN')
G:  ('alright', 'NN') DEP:  appos depndent:  ('plate', 'NN')
G:  ('plate', 'NN') DEP:  nmod depndent:  ('plastic', 'NN')
G:  ('plasti

# Word embedding based on co-ocurrence

In [50]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
def apply_aspdep_weight(train_df):
    train_text = train_df[' text'].values.astype('U')
#     train_aspdep = train_df['asp_dep_words'].values.astype('U')
    text_count_vect = CountVectorizer()
    x_text_counts = text_count_vect.fit_transform(train_text)
    Xc = (x_text_counts.T * x_text_counts)
    Xc.setdiag(0)
    print(pd.DataFrame(Xc.todense(), 
    columns=text_count_vect.get_feature_names(), 
    index=text_count_vect.get_feature_names()))
apply_aspdep_weight(test_df)

             10  11  13  159  16  20  2008  2011  24  250 ...   would  wouldn  \
10            0   0   0    0   0   1     0     0   0    0 ...       1       0   
11            0   0   0    0   0   0     0     0   0    0 ...       0       0   
13            0   0   0    0   0   0     0     0   0    0 ...       0       0   
159           0   0   0    0   0   0     1     0   0    0 ...       1       0   
16            0   0   0    0   0   0     0     0   0    0 ...       0       0   
20            1   0   0    0   0   0     0     0   0    0 ...       0       0   
2008          0   0   0    1   0   0     0     0   0    0 ...       1       0   
2011          0   0   0    0   0   0     0     0   0    0 ...       0       0   
24            0   0   0    0   0   0     0     0   0    0 ...       0       0   
250           0   0   0    0   0   0     0     0   0    0 ...       0       0   
300           0   0   0    0   0   0     0     0   0    0 ...       0       0   
320gb         0   0   0    0