In [2]:
import pandas as pd
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

In [3]:
module_path = os.path.abspath(os.path.join('..'))
sys.path.append(module_path)
from utils import read_from_zip

In [4]:
data_folder = Path('data')
train_zip = 'train.zip'
test_zip = 'test.zip'

In [5]:
train = read_from_zip(data_folder.joinpath(train_zip))
test = read_from_zip(data_folder.joinpath(test_zip))

In [6]:
X = train['text']
y = train['author']

In [9]:
GLOVE = Path(r"C:\Users\ABansal4\OneDrive - Schlumberger\auto-nlp-platform\data\embeddings\glove.6B\glove.6B.300d.txt")
WORD2VEC = Path(r"C:\Users\ABansal4\OneDrive - Schlumberger\auto-nlp-platform\data\embeddings\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin")

In [48]:
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import KeyedVectors
from nltk import word_tokenize, sent_tokenize, WordNetLemmatizer, pos_tag, wordpunct_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm

In [59]:
STOPWORDS = stopwords.words('english')
OOD = set()

def sent2vec(sent_tokens, embeddings_index, dim):
    """
    Add an encoded class label to the given dataframe

    Parameters
    ----------
    sent_tokens: List(str), [token1, token2, ....token_n]
        A sentence splitted into tokens which has to be vectorized
    embeddings_index : dict, {str: array(float)}, {word: [embedding array]}
        Embedding dictionary used to loopup word embeddings
    dim: int
        Number of dimension in embedding vector

    Returns
    ----------
    emb_vec : numpy array, shape (dim,)
        Normalized embedding vector corresponding to sum of all the word vectors in the given sentence
    """
    global OOD
    print(len(OOD))
    words = str(sent_tokens).lower() #.decode('utf-8')
    words = word_tokenize(words, language='english')
    sent_tokens = [w for w in words if (w not in STOPWORDS) & (w.isalpha())]
    M = []
    for w in sent_tokens:
        try:
            M.append(embeddings_index[w])
        except KeyError:
            OOD.update([w])
#             print('Word (token) %s not found in provided embedding dictionary' %w)

    M = np.array(M)
    v = M.sum(axis=0)                   #Sum all the rows across column
    if type(v) != np.ndarray:
        return np.zeros(dim)
    # Return a normalized vector
    emb_vec = v / np.sqrt((v ** 2).sum())
    return emb_vec

def load_embeddings_from_txt(embedding_file):
    """
    Reads a text embedding file into a dictionary with keys as words.

    Parameters
    ----------
    embedding_file: str, Path like
        Path to the embedding file
    
    Returns
    ----------
    embeddings_index : dict, {word(str): vector(ndarray(dim,))}
        Dictionary of word vectors.
    """
    embeddings_index = dict()
    f = open(embedding_file, encoding="utf-8")
    #Transfer the embedding weights into a dictionary by iterating through every line of the file.
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    return embeddings_index

def load_embedding_index(embedding_file):
    """
    Reads a embedding file into a dictionary with keys as words.

    Parameters
    ----------
    embedding_file: Path like object
        Path to the embedding file
    
    Returns
    ----------
    embeddings_index : dict, {word(str): vector(ndarray(dim,))}
        Dictionary of word vectors.

    Notes
    Param:  embeddings_type: str, ['glove', 'word2vec'], deprecated
                supported type of embeddings. Now loading word vectors based on file extension.
    File extension based loading also supports loading a dictionary pickle.
    """
    file_type = embedding_file.suffix
    assert file_type != ''
    if file_type == '.bin':
    # if embeddings_type == "word2vec":
        w2v_model = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
        embeddings_index = dict()
        for word in w2v_model.wv.vocab:
            embeddings_index[word] = w2v_model.word_vec(word)
    elif file_type == '.txt':
    # elif embeddings_type == "glove":
        embeddings_index = load_embeddings_from_txt(embedding_file)
    elif file_type == '.pickle':
        embeddings_index = load(embedding_file)
    else:
        raise ValueError('Could not load specified embedding file. Supported files are .txt, .bin & .pickle')

    assert isinstance(embeddings_index, dict) == True
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index

In [44]:
EMBEDDINGS = load_embedding_index(GLOVE)

400000it [02:47, 2391.09it/s]

Loaded 400000 word vectors.





In [60]:
X_embeddings = np.array([sent2vec(sent, EMBEDDINGS, dim=300) for sent in X])

0
0
0
1
1
1
2
2
4
5
5
5
5
5
5
5
5
5
6
6
7
7
7
8
11
11
11
12
12
12
13
13
13
13
13
13
13
13
13
13
13
13
13
18
19
20
20
21
21
22
22
22
23
23
23
23
23
23
24
24
25
25
25
25
25
25
25
26
26
26
27
27
27
27
27
27
27
27
27
27
27
27
27
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
31
31
31
31
33
35
35
35
35
36
36
36
36
36
36
36
36
37
38
38
38
38
38
38
38
38
39
39
39
39
39
39
39
39
39
39
39
39
41
41
41
41
41
41
41
43
43
43
43
43
43
43
43
43
43
43
44
45
45
45
45
45
45
45
45
46
47
47
48
48
48
48
48
48
48
48
48
48
48
48
49
49
51
52
54
54
54
54
56
56
57
58
58
58
59
59
59
59
59
59
60
60
60
60
60
60
60
60
60
60
60
60
60
61
61
61
61
61
61
61
62
62
62
63
63
63
63
63
64
66
66
66
66
66
66
66
66
66
66
66
66
66
66
66
66
66
66
66
66
66
66
66
67
67
67
67
76
76
76
76
76
76
76
76
76
76
77
77
77
78
78
78
78
78
78
78
78
79
79
79
79
79
82
82
82
82
83
85
85
87
88
88
88
89
89
90
91
93
93
94
95
95
95
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
97
98
98
98
98
98
98
98
98
98
98
98
98
98
98
99
100
101
10

459
459
459
459
459
459
459
459
459
459
459
459
459
459
459
460
460
460
460
460
460
460
460
460
461
461
461
462
462
463
463
463
463
463
463
463
463
463
463
464
464
464
464
464
466
467
467
467
467
467
467
468
468
469
469
469
469
469
469
469
470
470
470
470
470
470
471
471
471
472
472
472
472
472
472
472
472
472
472
472
472
473
473
473
473
473
473
473
473
473
473
474
474
474
474
474
474
474
474
475
476
476
476
476
476
476
476
476
476
477
478
478
478
479
479
479
479
480
480
480
481
481
481
481
481
481
481
481
481
481
481
481
481
484
484
484
484
484
484
484
484
485
485
485
485
485
485
485
485
485
486
486
486
486
486
486
487
487
488
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
489
490
490
490
491
491
491
491
491
491
491
491
491
492
492
492
492
492
492
492
492
492
492
492
492
492
492
492
492
492
492
492
492
492
492
492
493
493
493
493
493
495
495
495
495
495
495
495
495
495
495
499
500
500
500
502
502
503
503
503
503
503
503
503


814
814
814
815
815
816
816
816
816
816
816
816
816
816
816
816
816
816
816
816
816
816
816
816
816
816
816
816
816
816
821
821
821
821
821
821
821
821
821
821
821
821
821
821
821
821
822
822
822
822
822
822
822
822
822
822
822
822
822
822
822
824
824
825
825
825
825
825
825
825
825
825
825
826
826
827
827
827
827
828
828
828
829
829
829
829
829
829
830
830
830
830
830
830
830
830
830
830
830
830
830
830
830
830
830
830
830
831
831
831
831
831
831
831
831
831
831
831
831
831
831
831
831
831
831
831
831
831
835
835
835
835
835
835
835
835
836
836
836
836
837
837
837
837
837
837
837
837
837
838
838
838
838
838
838
838
838
838
838
838
838
838
838
838
839
840
840
840
840
840
841
841
841
841
841
841
841
841
841
842
842
842
842
842
842
842
842
842
842
842
842
843
843
843
843
843
843
843
843
843
843
843
843
843
843
843
843
843
843
843
843
843
843
844
844
844
844
844
844
844
844
844
844
844
844
844
844
844
844
844
845
845
845
845
845
845
846
846
846
846
846
846
846
846
846
846
846
846
847
847


1094
1095
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1096
1097
1097
1097
1097
1097
1097
1097
1097
1098
1098
1098
1098
1098
1099
1099
1100
1100
1100
1100
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1101
1102
1104
1104
1105
1105
1105
1105
1106
1106
1106
1106
1106
1106
1106
1106
1106
1106
1106
1106
1106
1107
1107
1107
1109
1109
1109
1109
1109
1109
1109
1109
1109
1109
1109
1109
1109
1109
1109
1109
1109
1109
1109
1110
1111
1111
1111
1111
1111
1111
1111
1111
1111
1111
1111
1111
1111
1111
1111
1112
1112
1112
1112
1112
1112
1112
1112
1112
1112
1112
1112
1112
1113
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1114
1115
1115
1115
1115
1115
1115
1115


1289
1289
1289
1289
1290
1290
1290
1290
1290
1290
1290
1290
1290
1290
1290
1291
1292
1293
1293
1293
1294
1294
1294
1294
1294
1294
1294
1294
1294
1294
1294
1294
1302
1302
1302
1302
1302
1302
1302
1302
1302
1302
1302
1302
1302
1302
1302
1302
1302
1302
1302
1302
1302
1303
1304
1304
1304
1304
1304
1304
1304
1304
1304
1304
1304
1304
1306
1306
1306
1306
1306
1306
1306
1306
1306
1306
1306
1306
1306
1306
1306
1306
1306
1306
1307
1308
1309
1309
1309
1309
1309
1309
1309
1309
1309
1309
1309
1309
1309
1309
1309
1309
1309
1309
1309
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1310
1311
1311
1311
1311
1311
1311
1311
1311
1312
1312
1312
1312
1312
1312
1312
1312
1312
1312
1312
1312
1312
1312
1312
1312
1312
1312
1313
1313
1313
1313
1313
1313
1314
1314
1314
1314
1314
1314
1314
1314
1315
1315
1315
1315
1315
1315
1315
1315
1315
1315
1315
1315
1315
1316
1316
1316
1316
1316
1317
1317
1317
1317
1317
1317


1506
1506
1506
1506
1506
1506
1506
1506
1506
1506
1506
1506
1507
1508
1509
1509
1509
1509
1509
1509
1509
1509
1509
1509
1509
1509
1509
1509
1509
1509
1509
1509
1509
1511
1511
1511
1511
1511
1511
1511
1511
1511
1511
1512
1512
1512
1512
1512
1512
1512
1512
1512
1512
1512
1513
1513
1513
1513
1513
1513
1513
1513
1513
1513
1513
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1514
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1516
1517
1517
1517
1518
1518
1518
1518
1518
1518
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1519
1520
1520
1521
1521
1522
1522
1523
1523
1523
1523
1523
1523
1523
1523
1523
1523
1523
1523
1523
1523


1688
1688
1689
1690
1690
1690
1690
1690
1690
1690
1690
1691
1691
1691
1691
1692
1692
1692
1692
1692
1693
1693
1693
1693
1693
1693
1694
1694
1694
1694
1694
1694
1694
1695
1695
1695
1695
1695
1695
1695
1695
1695
1695
1696
1696
1696
1696
1697
1697
1697
1697
1697
1697
1698
1699
1699
1699
1699
1699
1699
1699
1699
1699
1699
1700
1701
1701
1701
1701
1701
1701
1701
1701
1701
1701
1701
1701
1702
1702
1702
1702
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1703
1704
1704
1704
1704
1704
1704
1704
1704
1704
1704
1704
1704
1704
1705
1705
1705
1705
1705
1705
1705
1706
1706
1706
1706
1706
1706
1706
1707
1707
1707
1707
1707
1707
1707
1709
1709
1709
1709
1709
1709
1709
1709
1709
1709
1709
1709
1709
1710
1710
1710
1710
1710
1710
1710
1710
1710
1710
1710
1710
1710
1710
1710
1710
1710
1710
1712
1712
1712
1712
1712
1712
1712
1712
1712
1713
1713
1713
1713
1713
1713
1713
1713
1713
1714
1714
1714
1714
1714
1714
1714
1714
1714
1714
1714


1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1882
1883
1883
1883
1883
1883
1883
1883
1883
1883
1883
1883
1883
1883
1883
1883
1883
1884
1884
1884
1884
1884
1884
1884
1884
1884
1884
1884
1885
1886
1886
1886
1886
1886
1886
1886
1886
1886
1886
1886
1886
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1887
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1888
1889
1889
1889
1889
1889
1889
1889
1889
1889
1889
1895
1896
1896
1896
1896
1896
1896
1896
1896
1896
1896
1896
1896
1896
1896
1896
1896
1896
1896
1896
1896
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1897
1898
1898
1898
1898
1899
1899
1899
1899
1899
1899
1899
1899
1900
1900
1900
1900
1900
1900


2075
2075
2075
2075
2075
2075
2075
2075
2075
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2076
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2077
2078
2078
2078
2079
2079
2079
2079
2079
2079
2079
2079
2079
2079
2079
2079
2079
2079
2079
2079
2079
2079
2079
2079
2080
2080
2080
2080
2080
2080
2080
2080
2080
2080
2080
2080
2080
2080
2080
2080
2080
2080
2080
2080
2081
2081
2081
2081
2081
2081
2081
2081
2081
2081
2081
2081
2081
2081
2081
2081
2082
2082
2082
2082
2082
2082
2082
2082
2082
2082
2082
2082
2082
2082
2082
2082
2082
2082
2082
2083
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084
2084


2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2221
2223
2223
2223
2223
2223
2223
2223
2223
2223
2223
2225
2225
2225
2225
2225
2225
2225
2225
2226
2227
2227
2227
2227
2227
2227
2227
2227
2228
2228
2228
2228
2228
2228
2228
2228
2228
2228
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2229
2230
2230
2230
2230
2230
2230
2230
2230
2230
2230
2230
2230
2230
2230
2230
2230
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2231
2232
2232
2232
2232
2232
2233
2233
2233
2233
2233
2233
2234
2234
2234
2234
2234
2234
2235
2235
2235
2235
2235
2236
2238
2238
2238
2238
2238
2238
2238
2238
2238
2238
2238
2238
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239
2239


2410
2410
2410
2411
2411
2411
2411
2411
2411
2411
2411
2411
2411
2411
2411
2411
2411
2411
2411
2411
2411
2411
2411
2412
2412
2412
2412
2412
2412
2412
2412
2412
2412
2413
2413
2413
2413
2413
2413
2413
2413
2413
2413
2413
2413
2413
2413
2413
2413
2413
2413
2413
2413
2413
2414
2414
2414
2414
2414
2414
2414
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2415
2416
2416
2416
2416
2416
2416
2416
2416
2416
2416
2417
2417
2417
2417
2417
2417
2417
2417
2417
2417
2417
2417
2417
2417
2417
2417
2417
2417
2417
2417
2418
2418
2418
2418
2418
2418
2418
2418
2418
2418
2420
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2421
2422
2422
2422
2422
2422
2422
2422
2423
2423
2423
2423
2423
2423
2423
2424
2424
2424
2424
2424
2424
2424
2424
2424
2424
2424
2424
2424
2424
2424
2424
2424
2424
2425
2425
2425
2425
2425
2425
2425
2425
2425
2425
2425
2425
2425
2425
2425
2426


In [61]:
OOD

{'cloathing',
 'montanvert',
 'instructress',
 'stenches',
 'babylonish',
 'injurer',
 'moissart',
 'repulsively',
 'inspiriting',
 'cathuria',
 'returneth',
 'perticcler',
 'pxll',
 'nthlei',
 'affright',
 'unparticled',
 'nephren',
 'sxrrxws',
 'emicant',
 'grovelled',
 'zobna',
 'breakless',
 'penstruthal',
 'mennais',
 'ignoratio',
 'borellus',
 'magnetoesthetics',
 'instantiae',
 'traordinary',
 'puseyism',
 'horrorless',
 'supererogation',
 'pasquinade',
 'uprearing',
 'otterholm',
 'overburthened',
 'putridity',
 'vociferate',
 'currycomb',
 'receeded',
 'certum',
 'simia',
 'feerd',
 'drawlingly',
 'unsaddled',
 'curius',
 'silhouetting',
 'indite',
 'hasheesh',
 'baound',
 'falteringly',
 'mowin',
 'kadatheron',
 'convincingness',
 'druv',
 'asphodels',
 'helvia',
 'recontre',
 'compar',
 'cataleptical',
 'madnesses',
 'haow',
 'imbibes',
 'miltonic',
 'unfaded',
 'pourtrayed',
 'frxg',
 'valentinianus',
 'decadents',
 'knowest',
 'foppery',
 'equerries',
 'cerements',
 'creat

In [86]:
w = 'fxxl'
print(train[X.str.lower().str.contains(w)]['text'].values)
print(train[X.str.lower().str.contains(w)]['author'].values)

["Cxxl, nxw cxxl Dx be cxxl, yxu fxxl Nxne xf yxur crxwing, xld cxck Dxn't frxwn sx dxn't Dxn't hxllx, nxr hxwl, nxr grxwl, nxr bxw wxw wxw Gxxd Lxrd, Jxhn, hxw yxu dx lxxk Txld yxu sx, yxu knxw, but stxp rxlling yxur gxxse xf an xld pxll abxut sx, and gx and drxwn yxur sxrrxws in a bxwl' The uproar occasioned by this mystical and cabalistical article, is not to be conceived."]
['EAP']


In [87]:
w = 'mxther'
print(train[X.str.lower().str.contains(w)]['text'].values)
print(train[X.str.lower().str.contains(w)]['author'].values)

["Dxn't crxw, anxther time, befxre yxu're xut xf the wxxds Dxes yxur mxther knxw yxu're xut?"]
['EAP']


In [69]:
train2 = pd.read_csv('data/train.csv')

In [89]:
train2[X.str.lower().str.contains('mxther')]

Unnamed: 0,id,text,author
18364,id14727,"Dxn't crxw, anxther time, befxre yxu're xut xf...",EAP
