In [15]:
import findspark
findspark.init()

from pyspark.ml.feature import Word2Vec
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

from pyspark import SparkContext
from pyspark.streaming import StreamingContext #Import streaming context
from pyspark.sql import SparkSession

import spacy
from pyspark.sql import functions as F
from pyspark.ml.functions import array_to_vector

import numpy as np
import spacy

import pandas as pd
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
import torch
import spacy
import math

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from torch import nn

tqdm.pandas()
encoder = LabelEncoder()

nlp = spacy.load('en_core_web_lg')

In [3]:
spark = SparkSession.builder\
    .master('local[*]')\
    .appName('explore')\
    .getOrCreate()
sc = spark.sparkContext._conf.setAll([('spark.driver.maxResultSize', '8g')])

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/12/07 13:45:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
df = spark.read.json('/common/users/shared/cs543_fall22_group3/combined/combined_raw')

                                                                                

In [13]:
def vectorize(text):
    return ','.join(nlp(str(text)).vector)

vectorize_udf = F.udf(lambda z: vectorize(z))
processed_df = df.withColumn("vector", F.split(vectorize_udf(F.col("selected_text")), ",")).collect()

In [14]:
processed_df.printSchema()

root
 |-- selected_text: string (nullable = true)
 |-- vector: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [16]:
class TextClassificationModel(nn.Module):

    def __init__(self, num_class, embed_dim=300, vocab_size=45, pad_index=0,
                 stride=1, kernel_size=3, conv_out_size=64, dropout_rate=0.25):
        super(TextClassificationModel, self).__init__()

        # Embedding layer parameters
        self.embed_size = embed_dim
        self.vocab_size = vocab_size
        self.pad_index = pad_index
       
        # Conv layer parameters
        self.stride = stride
        self.kernel_size = kernel_size
        self.conv_out_size = conv_out_size
       
        # Misc
        self.dropout_rate = dropout_rate
        
        self.embed_size = 1
        # Layers
        self.conv = torch.nn.Conv1d(self.embed_size, self.conv_out_size, self.kernel_size, self.stride)
        self.hidden_act = torch.relu
        self.max_pool = torch.nn.MaxPool1d(self.kernel_size, self.stride)
       
        self.flatten = lambda x: x.view(x.shape[0], x.shape[1]*x.shape[2])
       
        self.fc = torch.nn.Linear(self._linear_layer_in_size(), num_class)

        if self.dropout_rate:
            self.dropout = torch.nn.Dropout(self.dropout_rate)

    def _linear_layer_in_size(self):
        out_conv_1 = ((self.embed_size - 1 * (self.kernel_size - 1) - 1) / self.stride) + 1
        out_conv_1 = math.floor(out_conv_1)
        out_pool_1 = ((out_conv_1 - 1 * (self.kernel_size - 1) - 1) / self.stride) + 1
        out_pool_1 = math.floor(out_pool_1)
                           
        # return out_pool_1*self.conv_out_size
        return 18944

    def forward(self, x):
        # print(x.shape)

        # x = torch.reshape(x. (x.shape[0],)

        x = torch.unsqueeze(x, 1)
        # x = torch.transpose(x, 1, 2) # (batch, 1, 300)

        x = self.conv(x)
        # print(x.shape)

        x = self.hidden_act(x)
        # print(x.shape)

        x = self.max_pool(x)
        # print(x.shape)

        x = self.flatten(x)
        # print(x.shape)

        if self.dropout_rate:
            x = self.dropout(x)

        x = self.fc(x)

        return x

In [17]:
import torch
import math
m = TextClassificationModel(300, 42)
m.load_state_dict(torch.load('/common/users/shared/cs543_fall22_group3/models/class_model.pt'))

<All keys matched successfully>

In [18]:
m.eval()

TextClassificationModel(
  (conv): Conv1d(1, 64, kernel_size=(3,), stride=(1,))
  (max_pool): MaxPool1d(kernel_size=3, stride=1, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=18944, out_features=300, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

<tf.Tensor: shape=(96,), dtype=float32, numpy=
array([ 0.08313738,  0.23493402,  0.50700295,  0.20518368, -0.61994565,
       -0.31686923, -0.38639998, -0.47822252,  0.2874061 ,  0.5616789 ,
        0.27226388,  0.23311011,  0.42662886, -0.45091665, -0.0188958 ,
       -0.04201526,  0.51277566, -0.22706264, -0.32356876,  0.1642192 ,
       -0.02346741,  0.01970796,  0.12345982,  0.369655  , -0.18727557,
       -0.22674748,  0.92299557,  0.17571642, -0.04334598,  0.12421735,
       -0.39291257,  0.2203804 , -0.08235582, -0.09592929,  0.5930834 ,
       -0.35781085,  0.49502695, -0.15281105, -0.5987592 ,  0.3305167 ,
        0.25534928,  0.23432654,  0.1319402 , -0.5745461 , -0.42265713,
       -0.00779722,  0.0778453 ,  0.54223657,  0.05138373,  0.3120443 ,
        0.03517745, -0.3613884 , -0.32268447, -0.6085768 , -0.31113058,
        0.32031095,  0.83841497, -0.47435936, -0.4444853 , -0.11046734,
       -0.00772311, -0.46004343,  0.3659531 , -0.4661703 , -0.31928566,
        0.0366398

In [31]:
m(torch.from_numpy(np.array(df.take(1)[0].vector, dtype=np.float32)))

RuntimeError: Given groups=1, weight of size [64, 1, 3], expected input[1, 96, 1] to have 1 channels, but got 96 channels instead