In [None]:
import re
import string
import numpy as np 
import random
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords

from tqdm import tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

import warnings
warnings.filterwarnings("ignore")

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.


In [None]:
def random_colours(number_of_colors):
    '''
    Build a list of randomly chosen hex colour codes.
    Input:
        number_of_colors - integer, how many colour codes to generate.
    Output:
        List of strings such as ['#E86DA4'], one entry per requested colour.
    '''
    hex_digits = '0123456789ABCDEF'
    # Six random hex digits per colour, drawn in the same order as a plain nested loop.
    return [
        "#" + ''.join(random.choice(hex_digits) for _ in range(6))
        for _ in range(number_of_colors)
    ]

In [None]:
# Location of the SICK dataset CSV. Set the SICK_CSV_PATH environment variable to
# point at your own copy; otherwise the original author's local path is used.
SICK_CSV_PATH = os.environ.get("SICK_CSV_PATH", r"C:/Users/me1awq/PhD/docsim/datasets/open_source/sick.csv")
sick_data = pd.read_csv(SICK_CSV_PATH)

In [None]:
# (rows, columns) of the raw frame; the original cell printed this twice.
print(sick_data.shape)

In [None]:
# Column names, dtypes and non-null counts.
sick_data.info()

In [None]:
# Drop every row that contains a missing value, modifying sick_data in place.
# The index is not reset here, so it can have gaps after this cell.
sick_data.dropna(inplace=True)

In [None]:
# Preview the first rows after dropping missing values.
sick_data.head()

In [None]:
# Summary statistics (pandas describes the numeric columns by default).
sick_data.describe()

In [None]:
# Number of sentence pairs per entailment label, largest group first.
temp = (
    sick_data
    .groupby('entailment_judgment')
    .count()['sentence_A']
    .reset_index()
    .sort_values(by='sentence_A', ascending=False)
)
temp.style.background_gradient(cmap='Purples')

In [None]:
# Bar chart of how many rows carry each entailment_judgment label.
plt.figure(figsize=(12,6))
sns.countplot(x='entailment_judgment',data=sick_data)

In [None]:
def jaccard(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    |A & B| / |A | B| over the resulting token sets.

    Parameters:
        str1, str2 - the two strings to compare.
    Returns:
        float in [0.0, 1.0]. When neither string contains any token the ratio
        is undefined (0/0); the two empty sets are identical, so 1.0 is
        returned instead of raising ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty or whitespace-only.
        return 1.0
    return len(a & b) / union_size

In [None]:
# One [sentence_A, sentence_B, score] record per row of sick_data, in row order.
results_jaccard = [
    [sent_a, sent_b, jaccard(sent_a, sent_b)]
    for sent_a, sent_b in zip(sick_data['sentence_A'], sick_data['sentence_B'])
]

In [None]:
# results_jaccard has one record per row of sick_data, in the same order, so reuse
# sick_data's own index: a default 0..n-1 index would misalign the merge whenever
# dropna() has left gaps in sick_data's index.
# The frame is named jaccard_df so the jaccard() function is not overwritten.
jaccard_df = pd.DataFrame(
    results_jaccard,
    columns=["row.sentence_A", "row.sentence_B", "jaccard_score"],
    index=sick_data.index,
)
sick_data = sick_data.merge(jaccard_df, how="outer", left_index=True, right_index=True)

In [None]:
sick_data['Num_words_Sentence_B'] = sick_data['row.sentence_B'].apply(lambda x:len(str(x).split())) # Number of whitespace-separated words in sentence B
sick_data['Num_words_Sentence_A'] = sick_data['row.sentence_A'].apply(lambda x:len(str(x).split())) # Number of whitespace-separated words in sentence A
sick_data['difference_in_words'] = sick_data['Num_words_Sentence_A'] - sick_data['Num_words_Sentence_B'] # Word-count difference: sentence A minus sentence B

In [None]:
# Preview the frame with the Jaccard score and word-count columns added.
sick_data.head()

In [None]:
# Word counts of sentence B and sentence A, and the legend label for each series.
hist_data = [
    sick_data['Num_words_Sentence_B'],
    sick_data['Num_words_Sentence_A'],
]
group_labels = ['row.sentence_B', 'row.sentence_A']

# Histogram-only distplot (no KDE curve), styled in a single layout update.
fig = ff.create_distplot(hist_data, group_labels, show_curve=False)
fig.update_layout(
    title_text='Distribution of Number Of words',
    autosize=False,
    width=900,
    height=700,
    paper_bgcolor="LightSteelBlue",
)
fig.show()

In [None]:
# Overlaid kernel density estimates of word counts: sentence B in red, sentence A in blue.
# NOTE(review): `shade=` is deprecated in recent seaborn releases in favour of `fill=` -
# confirm the installed seaborn version before switching.
plt.figure(figsize=(12,6))
p1=sns.kdeplot(sick_data['Num_words_Sentence_B'], shade=True, color="r").set_title('Kernel Distribution of Number Of words')
p1=sns.kdeplot(sick_data['Num_words_Sentence_A'], shade=True, color="b")

In [None]:
# Kernel density of the A-minus-B word-count difference, one curve per label:
# ENTAILMENT in blue, CONTRADICTION in red, NEUTRAL in green.
# NOTE(review): `shade=` is deprecated in recent seaborn releases in favour of `fill=` -
# confirm the installed seaborn version before switching.
plt.figure(figsize=(12,6))
p1=sns.kdeplot(sick_data[sick_data['entailment_judgment']=='ENTAILMENT']['difference_in_words'], shade=True, color="b").set_title('Kernel Distribution of Difference in Number Of words')
p2=sns.kdeplot(sick_data[sick_data['entailment_judgment']=='CONTRADICTION']['difference_in_words'], shade=True, color="r")
p3=sns.kdeplot(sick_data[sick_data['entailment_judgment']=='NEUTRAL']['difference_in_words'], shade=True, color="g")

In [None]:
# Keep only the pairs whose sentence A has at most 20 words.
k =sick_data[sick_data['Num_words_Sentence_A']<=20]

In [None]:
# Mean Jaccard score per entailment label. Selecting the column before aggregating
# avoids averaging the text columns, which raises TypeError on pandas >= 2.0.
k.groupby('entailment_judgment')['jaccard_score'].mean()

In [None]:
# Display the ENTAILMENT rows of the filtered frame k.
k[k['entailment_judgment']=='ENTAILMENT']

In [None]:
def clean_text(text):
    '''Normalise a piece of text for word-level analysis.

    The input is converted with str() and lower-cased, then the following are
    removed in order: text in square brackets, http(s)/www links, <...> tags,
    punctuation, newline characters, and any word containing a digit.

    Input:
        text - value to clean; non-strings are converted with str().
    Output:
        The cleaned string. Whitespace around removed parts is left as is.
    '''
    # Raw string patterns: escapes such as \[ and \S are invalid in ordinary
    # string literals and trigger warnings on current Python versions.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Overwrite both sentence columns with their cleaned versions.
sick_data['sentence_A'] = sick_data['sentence_A'].apply(clean_text)
sick_data['sentence_B'] = sick_data['sentence_B'].apply(clean_text)

In [None]:
# Preview the frame after cleaning sentence_A and sentence_B.
sick_data.head()

In [None]:
# Tokenise sentence_B on whitespace, then tabulate the 20 most frequent tokens.
sick_data['temp_list'] = sick_data['sentence_B'].apply(lambda sentence: str(sentence).split())
top = Counter(word for words in sick_data['temp_list'] for word in words)
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words', 'count']
temp.style.background_gradient(cmap='Blues')

In [None]:
# fig = px.bar(temp, x="count", y="Common_words", title='Commmon Words in sentence_B', orientation='h', 
#              width=700, height=700,color='Common_words')
# fig.show()

In [None]:
def remove_stopword(x, stop_words=None):
    """Return the tokens of `x` that are not stopwords, in their original order.

    Parameters:
        x - iterable of word tokens.
        stop_words - optional collection of words to drop. When omitted, NLTK's
            English stopword list is used; it is loaded once and cached on the
            function as a frozenset rather than re-read for every token.
    Returns:
        list of the tokens that are not in the stopword collection.
    """
    if stop_words is None:
        stop_words = getattr(remove_stopword, "_english_stopwords", None)
        if stop_words is None:
            # First call: load the corpus once and keep it for later calls.
            stop_words = frozenset(stopwords.words('english'))
            remove_stopword._english_stopwords = stop_words
    return [y for y in x if y not in stop_words]
# Strip stopwords from every sentence_B token list.
sick_data['temp_list'] = sick_data['temp_list'].apply(remove_stopword)

In [None]:
# Recount tokens in temp_list and take the 20 most frequent.
top = Counter([item for sublist in sick_data['temp_list'] for item in sublist])
temp = pd.DataFrame(top.most_common(20))
# iloc[1:] discards the first row, i.e. the single most frequent word.
temp = temp.iloc[1:,:]
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Purples')

In [None]:
# Treemap of the words in `temp`, each tile sized by its count.
fig = px.treemap(temp, path=['Common_words'], values='count',title='Tree of Most Common Words')
fig.show()

In [None]:
# Tokenise sentence_A on whitespace and drop stopwords in a single pass.
sick_data['temp_list1'] = sick_data['sentence_A'].apply(
    lambda sentence: remove_stopword(str(sentence).split())
)

In [None]:
# Count tokens in temp_list1 and take the 25 most frequent.
top = Counter([item for sublist in sick_data['temp_list1'] for item in sublist])
temp = pd.DataFrame(top.most_common(25))
# iloc[1:] discards the first row, i.e. the single most frequent word.
temp = temp.iloc[1:,:]
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Blues')

In [None]:
# Horizontal bar chart of the words in `temp`, one colour per word.
fig = px.bar(temp, x="count", y="Common_words", title='Commmon Words in Text', orientation='h', 
             width=700, height=700,color='Common_words')
fig.show()

In [None]:
# Split the frame by label: ENTAILMENT -> Positive, CONTRADICTION -> Negative, NEUTRAL -> Neutral.
Positive_sent = sick_data.loc[sick_data['entailment_judgment'] == 'ENTAILMENT']
Negative_sent = sick_data.loc[sick_data['entailment_judgment'] == 'CONTRADICTION']
Neutral_sent = sick_data.loc[sick_data['entailment_judgment'] == 'NEUTRAL']

In [None]:
# 30 most common temp_list tokens among the ENTAILMENT rows.
top = Counter([item for sublist in Positive_sent['temp_list'] for item in sublist])
temp_positive = pd.DataFrame(top.most_common(30))
temp_positive.columns = ['Common_words','count']
temp_positive.style.background_gradient(cmap='Greens')

In [None]:
# Horizontal bar chart of the ENTAILMENT word counts.
fig = px.bar(temp_positive, x="count", y="Common_words", title='Most Commmon Positive Words', orientation='h', 
             width=700, height=700,color='Common_words')
fig.show()

In [None]:
# 20 most common temp_list tokens among the CONTRADICTION rows.
top = Counter([item for sublist in Negative_sent['temp_list'] for item in sublist])
temp_negative = pd.DataFrame(top.most_common(20))
# iloc[1:] discards the first row, i.e. the single most frequent word.
temp_negative = temp_negative.iloc[1:,:]
temp_negative.columns = ['Common_words','count']
temp_negative.style.background_gradient(cmap='Reds')

In [None]:
# Treemap of the CONTRADICTION word counts.
fig = px.treemap(temp_negative, path=['Common_words'], values='count',title='Common Contradicting Words')
fig.show()

In [None]:
# 20 most common temp_list tokens among the NEUTRAL rows.
top = Counter([item for sublist in Neutral_sent['temp_list'] for item in sublist])
temp_neutral = pd.DataFrame(top.most_common(20))
# Discard the first row (the single most frequent word). Positional .iloc matches
# the other per-label cells and does not depend on the index labels.
temp_neutral = temp_neutral.iloc[1:,:]
temp_neutral.columns = ['Common_words','count']
temp_neutral.style.background_gradient(cmap='Reds')

In [None]:

# Horizontal bar chart of the NEUTRAL word counts.
fig = px.bar(temp_neutral, x="count", y="Common_words", title='Most Commmon Neutral Words', orientation='h', 
             width=700, height=700,color='Common_words')
fig.show()

In [None]:
# Treemap of the NEUTRAL word counts.
fig = px.treemap(temp_neutral, path=['Common_words'], values='count',title='Tree Of Most Common Neutral Words')
fig.show()

In [None]:
# Flat list of every token in temp_list1 across all rows (duplicates kept).
raw_text = [word for word_list in sick_data['temp_list1'] for word in word_list]

In [None]:
def words_unique(entailment_judgment, numwords, raw_words, data=None):
    '''
    Find the most frequent words that occur in only one entailment class.

    Input:
        entailment_judgment - label to analyse; compared against the
            `entailment_judgment` column (ex. 'NEUTRAL');
        numwords - maximum number of words to return;
        raw_words - iterable of candidate words; only words present in it can be reported;
        data - optional DataFrame with `entailment_judgment` and `temp_list1`
            (list of tokens per row) columns. Defaults to the notebook-level `sick_data`.
    Output:
        DataFrame with columns ['words', 'count']: the words found in rows of the
        chosen class and in no row of any other class, with their number of
        occurrences in the chosen class, in descending order of count.
    '''
    if data is None:
        data = sick_data

    in_class = data['entailment_judgment'] == entailment_judgment

    # Vocabulary of every other class; sets keep the membership tests O(1).
    other_vocab = {
        word
        for tokens in data.loc[~in_class, 'temp_list1']
        for word in tokens
    }
    keep = set(raw_words) - other_vocab

    # Occurrences inside the chosen class, in first-seen order so that ties in
    # most_common() are broken the same way as with incremental counting.
    class_counts = Counter(
        word
        for tokens in data.loc[in_class, 'temp_list1']
        for word in tokens
    )
    exclusive_counts = Counter(
        {word: count for word, count in class_counts.items() if word in keep}
    )

    Unique_words = pd.DataFrame(exclusive_counts.most_common(numwords), columns = ['words','count'])

    return Unique_words

In [None]:
# Top 20 words that appear only in ENTAILMENT rows.
Unique_Positive= words_unique('ENTAILMENT', 20, raw_text)
print("The top 20 unique words in Entailment Sentences are:")
Unique_Positive.style.background_gradient(cmap='Greens')

In [None]:
# Treemap of the ENTAILMENT-only words, each tile sized by its count.
fig = px.treemap(Unique_Positive, path=['words'], values='count',title='Tree Of Unique Positive Words')
fig.show()

In [None]:
# Top 10 words that appear only in CONTRADICTION rows.
Unique_Negative= words_unique('CONTRADICTION', 10, raw_text)
print("The top 10 unique words in Contradicting Words are:")
Unique_Negative.style.background_gradient(cmap='Reds')

In [None]:
# Top 10 words that appear only in NEUTRAL rows.
Unique_Neutral= words_unique('NEUTRAL', 10, raw_text)
# The data are SICK sentence pairs, not tweets; the message now matches the Entailment cell.
print("The top 10 unique words in Neutral Sentences are:")
Unique_Neutral.style.background_gradient(cmap='Oranges')