In [1]:
import pandas as pd
import time, datetime, numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from collections import Counter

## Mounting Google Drive to Colab

In [2]:
from google.colab import drive

# Mount Google Drive so the shared project folder is reachable from this runtime.
drive.mount('/content/drive')

# Single source of truth for the dataset directory: if the shared folder moves,
# only this line needs to change.
DATA_DIR = '/content/drive/MyDrive/[CS4248] Project Folder/data'

# `df` is loaded from the train CSV; `val` and `test` from the validation and
# test CSVs respectively.
df = pd.read_csv(f'{DATA_DIR}/esnli_train.csv')
val = pd.read_csv(f'{DATA_DIR}/esnli_val.csv')
test = pd.read_csv(f'{DATA_DIR}/esnli_test.csv')

Mounted at /content/drive


## Corpus Analysis

#### Analysis Utility Functions

In [24]:
def get_word_statistics(df, col_name):
  """Summarise how many whitespace-separated tokens each entry of a column has.

  Missing values in ``df[col_name]`` are dropped before counting.

  Args:
    df: DataFrame containing the column to analyse.
    col_name: Name of a column whose non-null entries support ``.split()``.

  Returns:
    dict with keys ``avg_word_count``, ``median_word_count`` and
    ``std_dev_word_count`` (``np.std``, i.e. ddof=0).
  """
  texts = df[col_name].dropna()
  lengths = texts.map(lambda text: len(text.split()))

  # (result key, reducer) pairs, listed in the order the keys should appear.
  reducers = (
      ("avg_word_count", np.mean),
      ("median_word_count", np.median),
      ("std_dev_word_count", np.std),
  )
  return {key: reducer(lengths) for key, reducer in reducers}

def get_unqiue_vocabs(df, col_name):
  """Collect the distinct whitespace-separated tokens found in a column.

  Missing values in ``df[col_name]`` are skipped. Tokens are taken exactly as
  ``str.split()`` yields them (no case folding or punctuation stripping).

  Args:
    df: DataFrame containing the column to analyse.
    col_name: Name of a column whose non-null entries support ``.split()``.

  Returns:
    set of every token that appears at least once in the column.
  """
  texts = df[col_name].dropna()
  return {token for text in texts for token in text.split()}

def get_vocab_statistics(df, col_name):
  """Summarise vocabulary usage in a text column.

  Missing values in ``df[col_name]`` are dropped. Tokens are whatever
  ``str.split()`` yields, so they are case-sensitive and keep punctuation.

  Args:
    df: DataFrame containing the column to analyse.
    col_name: Name of a column whose non-null entries support ``.split()``.

  Returns:
    dict with keys:
      ``avg_vocab_count`` / ``median_vocab_count`` / ``std_dev_vocab_count``:
        mean, median and ``np.std`` (ddof=0) of the number of distinct tokens
        per entry.
      ``vocab_size``: number of distinct tokens across the whole column.
      ``normalized_vocab_size``: ``vocab_size`` divided by the number of
        non-null entries, or NaN when the column has no non-null entries.
  """
  column = df[col_name].dropna()
  vocab_counts = column.apply(lambda x: len(set(x.split())))

  # Build the column-wide vocabulary once; it is needed for both the raw and
  # the normalized size, and rebuilding it means re-scanning every entry.
  vocab_size = len(get_unqiue_vocabs(df, col_name))
  entry_count = len(column)

  results = {
      "avg_vocab_count": np.mean(vocab_counts),
      "median_vocab_count": np.median(vocab_counts),
      "std_dev_vocab_count": np.std(vocab_counts),
      "vocab_size": vocab_size,
      # An empty (or all-NaN) column would otherwise raise ZeroDivisionError.
      "normalized_vocab_size": vocab_size / entry_count if entry_count else float("nan")
  }

  return results

def print_statistics(df, df_name, target_cols):
    """Print word-count and vocabulary statistics for selected columns.

    For each column, one header line is printed, followed by the values
    returned by ``get_word_statistics`` and ``get_vocab_statistics``, then a
    blank line.

    Args:
        df: DataFrame holding the columns to report on.
        df_name: Label shown in the report header.
        target_cols: Iterable of column names to report, in display order.
    """
    # (printed label, result key) pairs, in display order.
    word_rows = (
        ("  Average:", "avg_word_count"),
        ("  Median:", "median_word_count"),
        ("  Standard Deviation:", "std_dev_word_count"),
    )
    vocab_rows = (
        ("  Average:", "avg_vocab_count"),
        ("  Median:", "median_vocab_count"),
        ("  Standard Deviation:", "std_dev_vocab_count"),
        ("  Vocabulary Size:", "vocab_size"),
        ("  Normalized Vocabulary Size:", "normalized_vocab_size"),
    )

    print(f"Statistics for DataFrame: {df_name}")
    for col_name in target_cols:
        print(f"Column: {col_name}")
        word_stats = get_word_statistics(df, col_name)
        vocab_stats = get_vocab_statistics(df, col_name)

        print("Word Count Statistics:")
        for label, key in word_rows:
            print(label, word_stats[key])

        print("Vocabulary Count Statistics:")
        for label, key in vocab_rows:
            print(label, vocab_stats[key])

        print()

# Report on each loaded split in turn; only the `val` and `test` calls include
# the Explanation_2 and Explanation_3 columns.
for frame, frame_name, columns in (
    (df, "df", ['Sentence1', 'Sentence2', 'Explanation_1']),
    (val, "val", ['Sentence1', 'Sentence2', 'Explanation_1', 'Explanation_2', 'Explanation_3']),
    (test, "test", ['Sentence1', 'Sentence2', 'Explanation_1', 'Explanation_2', 'Explanation_3']),
):
  print_statistics(frame, frame_name, columns)

Statistics for DataFrame: df
Column: Sentence1
Word Count Statistics:
  Average: 12.81492236508602
  Median: 12.0
  Standard Deviation: 5.673234137256183
Vocabulary Count Statistics:
  Average: 11.89091111888892
  Median: 11.0
  Standard Deviation: 4.594794723883672
  Vocabulary Size: 23894
  Normalized Vocabulary Size: 0.09190035346289793

Column: Sentence2
Word Count Statistics:
  Average: 7.398163817904891
  Median: 7.0
  Standard Deviation: 3.049651862169359
Vocabulary Count Statistics:
  Average: 7.243938368282589
  Median: 7.0
  Standard Deviation: 2.8062685308215864
  Vocabulary Size: 36777
  Normalized Vocabulary Size: 0.14145217618732595

Column: Explanation_1
Word Count Statistics:
  Average: 12.698958024824318
  Median: 11.0
  Standard Deviation: 6.453918558510613
Vocabulary Count Statistics:
  Average: 11.507983091462265
  Median: 11.0
  Standard Deviation: 5.037312336567891
  Vocabulary Size: 57706
  Normalized Vocabulary Size: 0.22195725170873928

Statistics for DataFrame