## Mounting the drive:

In [None]:
# Make the GAN-BERT project folder on Google Drive the working directory.
# NOTE(review): no drive-mount call is visible in this cell — presumably the drive is already mounted at /content/drive; confirm.
%cd "/content/drive/MyDrive/projects/Capstone_Project/Checkpoint_5/GAN-BERT"

/content/drive/MyDrive/projects/Capstone_Project/Checkpoint_5/GAN-BERT


## Data Preparation:

In [None]:
# Importing all the modules
import os
import csv
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib notebook
%matplotlib inline

In [None]:
# Load the baseline chat-message file (347 of its rows carry intent labels) and keep only
# the message text plus the two intent columns. A cell holding a single space is read as NaN.
df = pd.read_csv(
    '/content/drive/MyDrive/projects/Capstone_Project/Checkpoint_5/GAN-BERT/chat_messages_intents.csv',
    na_values=' ',
)[['body', 'intent_1', 'intent_2']]
df.head(2)

Unnamed: 0,body,intent_1,intent_2
0,Hi piston! My crankshaft depends on your bore ...,Providing information,NAN
1,thank you!,NAN,NAN


In [None]:
# Distinct values found in each of the two intent columns (NaN included)
tuple(df[column].unique() for column in ('intent_1', 'intent_2'))

(array(['Providing information', 'NAN', 'Asking for information', nan],
       dtype=object), array(['NAN', 'Exploration of design parameter values',
        'Effects of design parameters on objectives',
        'Dependencies between design parameters',
        'Selected design parameter values for objective(s)',
        'Monitoring objective values', nan, 'Tradeoff between objectives',
        'Selected design parameter values for objective(s)\r\n',
        'Effects of design parameters on objectives\r\n'], dtype=object))

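Takeaway: The first intent column contains 3 distinct labels (including the `NAN` / "None of the above" label). The second intent column lists 8 values besides `NAN`, but two of them are duplicates that differ only by a trailing `\r\n`, which leaves 6 distinct sub-intents. We plan to remove the `NAN` / "None of the above" intent, as it does not contribute much to the analysis.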

In [None]:
# One working frame per question: Q1 keeps the first intent column, Q2 the second.
# .copy() makes each frame explicitly independent of `df`, so the in-place edits in later
# cells operate on their own data and do not raise chained-assignment warnings.
df_Q1 = df[['body', 'intent_1']].copy()
df_Q2 = df[['body', 'intent_2']].copy()
df_Q1.head(3), df_Q2.head(3)

(                                                body                intent_1
 0  Hi piston! My crankshaft depends on your bore ...   Providing information
 1                                         thank you!                     NAN
 2    How low can you go on the piston bore diameter?  Asking for information,
                                                 body                                intent_2
 0  Hi piston! My crankshaft depends on your bore ...                                     NAN
 1                                         thank you!                                     NAN
 2    How low can you go on the piston bore diameter?  Exploration of design parameter values)

In [None]:
# Count labeled vs. unlabeled rows in the first dataframe (a NaN intent means "unlabeled")
unlabeled_instances_Q1 = df_Q1['intent_1'].isna().sum()
total_Q1 = df_Q1.shape[0]
labeled_instances_Q1 = total_Q1 - unlabeled_instances_Q1
print('Labeled instances: ', labeled_instances_Q1, ' and Unlabeled instances: ', unlabeled_instances_Q1)

Labeled instances:  347  and Unlabeled instances:  7343


In [None]:
# Count labeled vs. unlabeled rows in the second dataframe (a NaN intent means "unlabeled")
unlabeled_instances_Q2 = df_Q2['intent_2'].isna().sum()
total_Q2 = df_Q2.shape[0]
labeled_instances_Q2 = total_Q2 - unlabeled_instances_Q2
print('Labeled instances: ', labeled_instances_Q2, ' and Unlabeled instances: ', unlabeled_instances_Q2)

Labeled instances:  347  and Unlabeled instances:  7343


In [None]:
# Drop the rows annotated with the literal 'NAN' ("None of the above") intent.
# Filtering with a boolean mask (instead of an in-place drop by index label) keeps this cell
# re-runnable: dropping labels that are already gone would raise a KeyError.
# Unlabeled rows (real NaN) are kept, because NaN != 'NAN' evaluates to True.
df_Q1 = df_Q1[df_Q1['intent_1'] != 'NAN'].copy()
df_Q2 = df_Q2[df_Q2['intent_2'] != 'NAN'].copy()
# Inspect the filtered frames; the source frame `df` is never modified and would still show 'NAN'.
df_Q1['intent_1'].unique(), df_Q2['intent_2'].unique()

(array(['Providing information', 'NAN', 'Asking for information', nan],
       dtype=object), array(['NAN', 'Exploration of design parameter values',
        'Effects of design parameters on objectives',
        'Dependencies between design parameters',
        'Selected design parameter values for objective(s)',
        'Monitoring objective values', nan, 'Tradeoff between objectives',
        'Selected design parameter values for objective(s)\r\n',
        'Effects of design parameters on objectives\r\n'], dtype=object))

In [None]:
# Recount labeled vs. unlabeled rows in the first dataframe after removing the 'NAN' intent
unlabeled_instances_Q1 = df_Q1['intent_1'].isna().sum()
total_Q1 = df_Q1.shape[0]
labeled_instances_Q1 = total_Q1 - unlabeled_instances_Q1
print('Labeled instances in first dataframe: ', labeled_instances_Q1, ' and Unlabeled instances in first dataframe: ', unlabeled_instances_Q1)

Labeled instances:  272  and Unlabeled instances:  7343


In [None]:
# Recount labeled vs. unlabeled rows in the second dataframe after removing the 'NAN' intent
unlabeled_instances_Q2 = df_Q2['intent_2'].isna().sum()
total_Q2 = df_Q2.shape[0]
labeled_instances_Q2 = total_Q2 - unlabeled_instances_Q2
print('Labeled instances in second dataframe: ', labeled_instances_Q2, ' and Unlabeled instances in second dataframe: ', unlabeled_instances_Q2)

Labeled instances:  228  and Unlabeled instances:  7343


In [None]:
# Tag unlabeled rows with the "UNK_UNK" placeholder.
# fillna acts on the whole frame, so a missing `body` value becomes "UNK_UNK" as well.
df_Q1, df_Q2 = (frame.fillna("UNK_UNK") for frame in (df_Q1, df_Q2))

In [None]:
# Rows whose intent is the "UNK_UNK" placeholder form the unlabeled pool of each question
df_unlabeled_Q1 = df_Q1[df_Q1['intent_1'].eq('UNK_UNK')]
df_unlabeled_Q2 = df_Q2[df_Q2['intent_2'].eq('UNK_UNK')]
df_unlabeled_Q1,df_unlabeled_Q2

(                                               body intent_1
 56              we have excellent quality right now  UNK_UNK
 57    Have you broadcasted your most recent design?  UNK_UNK
 58              we have excellent quality right now  UNK_UNK
 65                           looks like we're done?  UNK_UNK
 68                                    seems like it  UNK_UNK
 ...                                             ...      ...
 7685                             ok no one anything  UNK_UNK
 7686                              increase it to 50  UNK_UNK
 7687              what about thickness of flywheel?  UNK_UNK
 7688                              its at 12, lowest  UNK_UNK
 7689                      raise it a little like 20  UNK_UNK
 
 [7343 rows x 2 columns],
                                                body intent_2
 56              we have excellent quality right now  UNK_UNK
 57    Have you broadcasted your most recent design?  UNK_UNK
 58              we have excellent quality

In [None]:
# Every row that is not tagged "UNK_UNK" carries a real annotation and goes to the labeled set
df_labeled_Q1 = df_Q1[df_Q1['intent_1'].ne('UNK_UNK')]
df_labeled_Q2 = df_Q2[df_Q2['intent_2'].ne('UNK_UNK')]
df_labeled_Q1,df_labeled_Q2

(                                                  body                intent_1
 0    Hi piston! My crankshaft depends on your bore ...   Providing information
 2      How low can you go on the piston bore diameter?  Asking for information
 3    all my fos values are in the hundreds, i suspe...  Asking for information
 4    Hi flywheel! My crankshaft depends on your fly...   Providing information
 5                                            min of 40   Providing information
 ..                                                 ...                     ...
 701                         we are at excelelent again   Providing information
 719         flywheel changed a value and I lost 5 lbs!   Providing information
 729              I decreased my weight parameter a bit   Providing information
 735                                        c=60, dc=15   Providing information
 772  Hey, I need to know what PISTON BORE DIAMETER ...  Asking for information
 
 [272 rows x 2 columns],
             

In [None]:
# Give all four frames a common schema: 'text' for the message and 'intent' / 'sub_intent' for the label.
# rename() returns a new frame that is re-bound to the same name; this avoids inplace=True on
# frames that were produced by boolean filtering (the pattern behind SettingWithCopyWarning).
df_labeled_Q1 = df_labeled_Q1.rename(columns={'intent_1': 'intent', 'body': 'text'})
df_unlabeled_Q1 = df_unlabeled_Q1.rename(columns={'intent_1': 'intent', 'body': 'text'})
df_labeled_Q2 = df_labeled_Q2.rename(columns={'intent_2': 'sub_intent', 'body': 'text'})
df_unlabeled_Q2 = df_unlabeled_Q2.rename(columns={'intent_2': 'sub_intent', 'body': 'text'})
df_labeled_Q1.head(2), df_unlabeled_Q1.head(2), df_labeled_Q2.head(2), df_unlabeled_Q2.head(2)

(                                                text                  intent
 0  Hi piston! My crankshaft depends on your bore ...   Providing information
 2    How low can you go on the piston bore diameter?  Asking for information,
                                              text   intent
 56            we have excellent quality right now  UNK_UNK
 57  Have you broadcasted your most recent design?  UNK_UNK,
                                                 text                                  sub_intent
 2    How low can you go on the piston bore diameter?      Exploration of design parameter values
 3  all my fos values are in the hundreds, i suspe...  Effects of design parameters on objectives,
                                              text sub_intent
 56            we have excellent quality right now    UNK_UNK
 57  Have you broadcasted your most recent design?    UNK_UNK)

In [None]:
# Sanity check: the label values that remain in the two labeled frames
df_labeled_Q1['intent'].unique(), df_labeled_Q2['sub_intent'].unique()

(array(['Providing information', 'Asking for information'], dtype=object),
 array(['Exploration of design parameter values',
        'Effects of design parameters on objectives',
        'Dependencies between design parameters',
        'Selected design parameter values for objective(s)',
        'Monitoring objective values', 'Tradeoff between objectives',
        'Selected design parameter values for objective(s)\r\n',
        'Effects of design parameters on objectives\r\n'], dtype=object))

In [None]:
# Short codes that stand in for the long annotation strings of the first intent
intent = {
    'Asking for information': 'Ask_info',
    'Providing information': 'Prov_Info',
}

# Short codes for the second intent. Two labels also occur with a trailing '\r\n' in the
# raw data; both spellings map to the same code.
sub_intent = {
    'Exploration of design parameter values': 'Explore_params',
    'Effects of design parameters on objectives': 'Effect_ofparams',
    'Effects of design parameters on objectives\r\n': 'Effect_ofparams',
    'Dependencies between design parameters': 'Dependencies_bwparams',
    'Selected design parameter values for objective(s)': 'Selected_params',
    'Monitoring objective values': 'Monitoring_objectives',
    'Tradeoff between objectives': 'Tradeoff_bw_objectives',
    'Selected design parameter values for objective(s)\r\n': 'Selected_params',
}

In [None]:
# Replace the long annotation strings with their short codes and renumber the rows from 0.
# Labels that are not keys of the mapping are left unchanged instead of turning into NaN;
# this keeps the cell safe to re-run, because already-shortened codes pass through untouched.
# assign()/reset_index() build new frames, so no filtered slice is mutated in place.
df_labeled_Q1 = df_labeled_Q1.assign(
    intent=df_labeled_Q1["intent"].map(lambda label: intent.get(label, label))
).reset_index(drop=True)
df_labeled_Q2 = df_labeled_Q2.assign(
    sub_intent=df_labeled_Q2["sub_intent"].map(lambda label: sub_intent.get(label, label))
).reset_index(drop=True)
df_labeled_Q1.head(2), df_labeled_Q2.head(2)

(                                                text     intent
 0  Hi piston! My crankshaft depends on your bore ...  Prov_Info
 1    How low can you go on the piston bore diameter?   Ask_info,
                                                 text       sub_intent
 0    How low can you go on the piston bore diameter?   Explore_params
 1  all my fos values are in the hundreds, i suspe...  Effect_ofparams)

In [None]:
# The unlabeled frames only need their index renumbered from 0: every label in them is
# already "UNK_UNK", so the former self-assignments of the label column were no-ops.
df_unlabeled_Q1 = df_unlabeled_Q1.reset_index(drop=True)
df_unlabeled_Q2 = df_unlabeled_Q2.reset_index(drop=True)
df_unlabeled_Q1.head(2), df_unlabeled_Q2.head(2)

(                                            text   intent
 0            we have excellent quality right now  UNK_UNK
 1  Have you broadcasted your most recent design?  UNK_UNK,
                                             text sub_intent
 0            we have excellent quality right now    UNK_UNK
 1  Have you broadcasted your most recent design?    UNK_UNK)

In [None]:
# Sanity check: label values of the labeled and the unlabeled frame for the first question
tuple(frame['intent'].unique() for frame in (df_labeled_Q1, df_unlabeled_Q1))

(array(['Prov_Info', 'Ask_info'], dtype=object),
 array(['UNK_UNK'], dtype=object))

In [None]:
# Sanity check: label values of the labeled and the unlabeled frame for the second question
tuple(frame['sub_intent'].unique() for frame in (df_labeled_Q2, df_unlabeled_Q2))

(array(['Explore_params', 'Effect_ofparams', 'Dependencies_bwparams',
        'Selected_params', 'Monitoring_objectives',
        'Tradeoff_bw_objectives'], dtype=object),
 array(['UNK_UNK'], dtype=object))

In [None]:
# Report the final row counts of the four prepared frames
print('length of df_labeled_Q1: ', df_labeled_Q1.shape[0], ', length of df_unlabeled_Q1: ', df_unlabeled_Q1.shape[0],
      'and length of df_labeled_Q2: ', df_labeled_Q2.shape[0], ', length of df_unlabeled_Q2: ', df_unlabeled_Q2.shape[0])

length of df_labeled_Q1:  272 , length of df_unlabeled_Q1:  7343 and length of df_labeled_Q2:  228 , length of df_unlabeled_Q2:  7343


For the first problem statement, we have 272 labeled instances and 7343 unlabeled instances, so labeled data makes up 3.57% of the material.
For the second problem statement, we have 228 labeled instances and 7343 unlabeled instances, so labeled data makes up 3.0% of the material.

## Preparing Data Files:



We split the labeled material into train, test and validation sets of 70%, 20% and 10%, respectively. In addition, we split the unlabeled data into 60% unlabeled examples and 40% test out-of-scope (OOS) samples, which are used to finally test the model.


In [None]:
# Root folder that holds the per-question output directories (Q1/, Q2/).
_GANBERT_ROOT = '/content/drive/MyDrive/projects/Capstone_Project/Checkpoint_5/GAN-BERT'


def _write_split_file(path, labels, texts):
  '''Write the header plus one "<label> <utterance>" line per row, replacing any existing file.'''
  # 'w' rather than 'a+': a re-run must replace the previous split instead of appending
  # a second header and a second copy of the rows to it.
  with open(path, 'w', encoding='utf-8') as f_out:
    f_out.write('fine_label utterance' + '\n')
    for label, text in zip(labels, texts):
      # Fold embedded line breaks so that one utterance always occupies exactly one line.
      utterance = ' '.join(str(text).splitlines())
      f_out.write(' '.join([str(label), utterance]) + '\n')


def _split_and_write(df_labeled, df_unlabeled, label_column, tag, output_dir, random_state):
  '''Shuffle both frames and write the five `<split>_<tag>.tsv` files into `output_dir`.'''
  train_fraction = 0.7
  test_fraction = 0.2
  unlabeled_fraction = 0.6
  split_point_train = int(train_fraction * len(df_labeled))
  split_point_test = split_point_train + int(test_fraction * len(df_labeled))
  split_point_unlabeled = int(unlabeled_fraction * len(df_unlabeled))

  df_labeled = df_labeled.sample(frac=1, random_state=random_state).reset_index(drop=True)
  df_unlabeled = df_unlabeled.sample(frac=1, random_state=random_state).reset_index(drop=True)

  if output_dir is None:
    output_dir = os.path.join(_GANBERT_ROOT, tag)
  os.makedirs(output_dir, exist_ok=True)

  # Validation takes every row after the test slice (about 10%), so rows lost to int()
  # truncation of the three fractions are not silently discarded.
  labeled_splits = (
    ('train', df_labeled.iloc[:split_point_train]),
    ('test', df_labeled.iloc[split_point_train:split_point_test]),
    ('valid', df_labeled.iloc[split_point_test:]),
  )
  for split_name, rows in labeled_splits:
    _write_split_file(os.path.join(output_dir, f'{split_name}_{tag}.tsv'),
                      rows[label_column], rows['text'])

  # Slice the text column itself so the OOS file really receives the held-out tail;
  # indexing from 0 again would repeat rows already written to the unlabeled file.
  unlabeled_splits = (
    ('unlabeled', 'UNK_UNK', df_unlabeled['text'].iloc[:split_point_unlabeled]),
    ('test_OOS', 'OOS', df_unlabeled['text'].iloc[split_point_unlabeled:]),
  )
  for split_name, label, texts in unlabeled_splits:
    _write_split_file(os.path.join(output_dir, f'{split_name}_{tag}.tsv'),
                      [label] * len(texts), texts)


def split_labeled_Q1(df_labeled, df_unlabeled, output_dir=None, random_state=None):
  '''
  Shuffle the first-question data and write it out as five space-separated files,
  each starting with the header line "fine_label utterance":
  1. train_Q1.tsv: first 70% of the shuffled labeled rows
  2. test_Q1.tsv: next 20% of the labeled rows
  3. valid_Q1.tsv: all remaining labeled rows (about 10%)
  4. unlabeled_Q1.tsv: first 60% of the shuffled unlabeled rows, labeled "UNK_UNK"
  5. test_OOS_Q1.tsv: remaining 40% of the unlabeled rows, labeled "OOS"

  Parameters:
    df_labeled: DataFrame with the columns 'text' and 'intent'.
    df_unlabeled: DataFrame with the column 'text'.
    output_dir: target folder, created if missing; defaults to the project's Q1 folder.
    random_state: optional seed passed to DataFrame.sample for a reproducible shuffle.

  Existing files are overwritten. Returns None.
  '''
  _split_and_write(df_labeled, df_unlabeled, 'intent', 'Q1', output_dir, random_state)


def split_labeled_Q2(df_labeled, df_unlabeled, output_dir=None, random_state=None):
  '''
  Shuffle the second-question data and write it out as five space-separated files,
  each starting with the header line "fine_label utterance":
  1. train_Q2.tsv: first 70% of the shuffled labeled rows
  2. test_Q2.tsv: next 20% of the labeled rows
  3. valid_Q2.tsv: all remaining labeled rows (about 10%)
  4. unlabeled_Q2.tsv: first 60% of the shuffled unlabeled rows, labeled "UNK_UNK"
  5. test_OOS_Q2.tsv: remaining 40% of the unlabeled rows, labeled "OOS"

  Parameters:
    df_labeled: DataFrame with the columns 'text' and 'sub_intent'.
    df_unlabeled: DataFrame with the column 'text'.
    output_dir: target folder, created if missing; defaults to the project's Q2 folder.
    random_state: optional seed passed to DataFrame.sample for a reproducible shuffle.

  Existing files are overwritten. Returns None.
  '''
  _split_and_write(df_labeled, df_unlabeled, 'sub_intent', 'Q2', output_dir, random_state)

# Write the split files for the first question, then for the second
split_labeled_Q1(df_labeled=df_labeled_Q1, df_unlabeled=df_unlabeled_Q1)
split_labeled_Q2(df_labeled=df_labeled_Q2, df_unlabeled=df_unlabeled_Q2)