In [1]:
# Importing necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

import glob
import regex 
import re, nltk
import emoji
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
wordnet_lemmatizer = WordNetLemmatizer()
from wordcloud import WordCloud, STOPWORDS

POS (Part of Speech) Tagging using SpaCy

In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")
# Or you can use the type() function to check the type of nlp
result = type(nlp) is spacy.language.Language

# Create an nlp object
doc = nlp("He went to play basketball")

# Iterate over the tokens
for token in doc:
    # Print the token and its part-of-speech tag
    print(token.text, "-->", token.pos_)

He --> PRON
went --> VERB
to --> PART
play --> VERB
basketball --> NOUN


Dependency Parsing using spaCy

In [3]:
# Function to detect the date at the start of the line to make each row unique.
def DateTime(s):
    regex_format = '^([0-9]+)(\/)([0-9]+)(\/)([0-9]+), ([0-9]+):([0-9]+)[ ]?(AM|PM|am|pm)? -' 
    result = re.match(regex_format, s)
    if result:
        return True
    return False

# Function to extract the group chat participant names
def Participants(a):
  a=a.split(":")
  if len(a)==2:
    return True
  else:
    return False

# Function to organize the data properly to be represented as a pandas dataframe
def GetDataPoints(line):   
    splitLine = line.split(' - ') 
    dateTime = splitLine[0]
    date, time = dateTime.split(', ') 
    message = ' '.join(splitLine[1:])
    if Participants(message): 
        splitMessage = message.split(': ') 
        participants = splitMessage[0] 
        message = ' '.join(splitMessage[1:])
    else:
        participants = None
    return date, time, participants, message

 # Function to extract the emojis and add them to a new column
def split_count(text):

    emoji_list = []
    data = regex.findall(r'\X', text)
    for word in data:
        if any(char in emoji.UNICODE_EMOJI for char in word):
            emoji_list.append(word)

    return emoji_list

Read WhatsApp Data

In [4]:
# Folder path
parsedData=[]
data = [] 
text_data = '/Users/aiyinchen/Documents/NLU Project/WhatsApp Chat with Nic - Baddy Coaching.txt' # replace with the file name of your text data
with open(text_data, encoding="utf-8") as fp: #fp stands for file pointer and helps in reading in the data
    fp.readline()
    messageBuffer = [] 
    date, time, participants = None, None, None
    while True:
        line = fp.readline() 
        if not line: 
            break
        line = line.strip() 
        if DateTime(line): 
            if len(messageBuffer)> 0: 
                parsedData.append([date, time, participants, ' '.join(messageBuffer)]) 
            messageBuffer.clear() 
            date, time, participants, message = GetDataPoints(line) 
            messageBuffer.append(message) 
        else:
            messageBuffer.append(line)

In [5]:
clean_data = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Participants', 'Message']) # Reading in the data in a dataframe form
clean_data["Date"] = pd.to_datetime(clean_data["Date"])# making sure the date column is in a datetime format.  
clean_data["emoji"] = clean_data["Message"].apply(split_count) # Applying the emoji function to extract the emojis from the messages.

clean_data.head(20) # Having a look at the first 10 entries

Unnamed: 0,Date,Time,Participants,Message,emoji
0,2023-09-01,15:44,Aiyin üî•,"Hi Nic, this is Aiyin here.. We were in touch ...",[]
1,2023-09-01,15:45,Aiyin üî•,"I just checked, i didnt manage to book 9-10pm ...",[]
2,2023-09-01,15:52,Nic Baddy Coaching,"Hi Aiyin, 6-7pm is not a problem",[]
3,2023-09-01,15:53,Nic Baddy Coaching,My time is usually very flexible as long I don...,[]
4,2023-09-01,16:22,Aiyin üî•,Thats perfect! I'll bring shuttles for the day...,[]
5,2023-09-01,16:23,Nic Baddy Coaching,Thanks. Just want to improve on general play,[]
6,2023-09-01,16:24,Nic Baddy Coaching,Probably have a lot of bad habits. Struggle to...,[]
7,2023-09-01,16:26,Aiyin üî•,Ahh okayy lets see what we can work on for Wed...,[]
8,2023-09-01,16:27,Nic Baddy Coaching,Me too! Thanks,[]
9,2023-09-02,23:45,Nic Baddy Coaching,"Hi Aiyin, is this at the Moberly Sports Centre...",[]


Getting rid of non useful data

In [6]:
clean_data = clean_data[clean_data.Message != '<Media omitted>']
clean_data = clean_data[clean_data.Message != 'This message was deleted']
clean_data = clean_data[clean_data.Message != 'You were added'] 

Tokenizing, removing stop words and lemmatizing the texts.

In [7]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aiyinchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aiyinchen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/aiyinchen/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
messages = []
stop_words = set(stopwords.words('english'))

for words in clean_data['Message']:
    only_letters = re.sub("[^a-zA-Z]", " ",words)
    tokens = nltk.word_tokenize(only_letters) #tokenize the sentences
    lower_case = [l.lower() for l in tokens] #convert all letters to lower case
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case)) #Remove stopwords from the comments
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result] #lemmatizes the words i.e convert similar words to their
    # base form while still considering the context in which the words are used 
    
    messages.append(' '.join(lemmas))   

WordCloud Visualization

In [9]:
#Let's use worldcloud to visualize the messages
unique_string=(" ").join(messages)
wordcloud = WordCloud(width = 2000, height = 1000,font_path='/System/Library/Fonts/Supplemental/AmericanTypewriter.ttc',background_color='white').generate(unique_string)
plt.figure(figsize=(20,12))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

ValueError: Only supported for TrueType fonts

In [10]:
co = CountVectorizer(ngram_range=(2,2),stop_words='english')
counts = co.fit_transform(messages)
pd.DataFrame(counts.sum(axis=0),columns=co.get_feature_names()).T.sort_values(0,ascending=False).head(50)




Unnamed: 0,0
hi aiyin,3
hey nic,3
hi nic,3
leisure centre,3
message edited,2
nic able,2
finsbury leisure,2
nic yes,2
problem wednesday,1
pm wednesday,1


In [11]:
# dependency parsing
for token in doc:
    print(token.text, "-->", token.dep_)

He --> nsubj
went --> ROOT
to --> aux
play --> advcl
basketball --> dobj


In [12]:
for token in doc:
    # check token pos
    if token.pos_=='NOUN':
        # print token
        print(token.text)

basketball


In [13]:
from spacy import displacy 
displacy.render(doc, style='dep',jupyter=True)

Use Spark NLP to extract dates from text

In [14]:
import sparknlp
# Start Spark Session
spark = sparknlp.start()

23/09/27 09:50:12 WARN Utils: Your hostname, Ais-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.192.76 instead (on interface en0)
23/09/27 09:50:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/aiyinchen/opt/anaconda3/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/aiyinchen/.ivy2/cache
The jars for the packages stored in: /Users/aiyinchen/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2d8ab8e6-024f-47cd-bd2c-30c010d0c62d;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.1 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.20.1 in central
	found com.google.guava#guava;31.1-jre in central
	found com.google.guav

In [15]:
# obtain datetime values Whatsapp timestamp
import datetime
anchor_year = clean_data.Date[0].year
anchor_month = clean_data.Date[0].month
anchor_day = clean_data.Date[0].day


In [16]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import (
    DateMatcher,
    MultiDateMatcher
)
import pyspark.sql.functions as F
# Step 1: Transforms raw texts to `document` annotation
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)
# Step 2: Extracts one date information from text
date = (
    DateMatcher()
    .setInputCols("document") 
    .setOutputCol("date") 
    .setOutputFormat("yyyy/MM/dd")
)
# Step 3: Extracts multiple date information from text. Set anchor day, month, year
anchorDate = (
    MultiDateMatcher()
    .setInputCols("document") 
    .setOutputCol("multi_date") 
    .setOutputFormat("MM/dd/yy")
    .setAnchorDateYear(anchor_year)
    .setAnchorDateMonth(anchor_month)
    .setAnchorDateDay(anchor_day)
)


nlpPipeline = Pipeline(stages=[document_assembler, date, anchorDate])
text_list = ["See you on next monday.",  
             "She was born on 02/03/1966.", 
             "The project started yesterday and will finish next year.", 
             "She will graduate by July 2023.", 
             "She will visit doctor tomorrow and next month again."]

In [115]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.ml import Pipeline
from pyspark.sql.types import (
    StringType,
    TimestampType,
    StructType,
    StructField
)

# Create a dataframe
text_df = spark.createDataFrame(text_list, StringType()).toDF("text")
# Fit the dataframe and get predictions
result = nlpPipeline.fit(text_df).transform(text_df)
# Display the extracted date information in a dataframe
result.selectExpr("text","date.result as date", "multi_date.result as multi_date").show(truncate=False)

+-------------------------------------------------------------------------------------------------------+------------+----------+
|text                                                                                                   |date        |multi_date|
+-------------------------------------------------------------------------------------------------------+------------+----------+
|Hi Nic, this is Aiyin here.. We were in touch on Superprof                                             |[]          |[]        |
|I just checked, i didnt manage to book 9-10pm for Wednesday, only 6-7pm, would it be too early for you?|[2023/09/27]|[]        |
|Hi Aiyin, 6-7pm is not a problem                                                                       |[2023/09/27]|[]        |
|My time is usually very flexible as long I don‚Äôt have client meetings                                  |[]          |[]        |
|Thats perfect! I'll bring shuttles for the day, anything specific you'd like to work on

Try the NLU task on existing WhatsApp Messages

In [95]:
# Actual code
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.ml import Pipeline
from pyspark.sql.types import (
    StringType,
    TimestampType,
    StructType,
    StructField, 
    ArrayType,
    MapType,
    FloatType
)
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import lit  # To create a constant column

# extract the messages from clean_data
messages = clean_data.Message
text_list = messages.values.tolist()
timestamp = clean_data.Date
date_list = timestamp.tolist()
date_list_df = pd.DataFrame({'timestamp': date_list})

# Create a dataframe
text_df = spark.createDataFrame(text_list, StringType()).toDF("text")
#timestamp_df = spark.createDataFrame(date_list, TimestampType()).toDF("timestamp")
# Define the schema for the PySpark DataFrame
schema = StructType([StructField("timestamp", TimestampType(), True)])
timestamp_df = spark.createDataFrame(date_list_df, schema=schema)

text_df.show()
timestamp_df.show()



+--------------------+
|                text|
+--------------------+
|Hi Nic, this is A...|
|I just checked, i...|
|Hi Aiyin, 6-7pm i...|
|My time is usuall...|
|Thats perfect! I'...|
|Thanks. Just want...|
|Probably have a l...|
|Ahh okayy lets se...|
|      Me too! Thanks|
|Hi Aiyin, is this...|
|Hey Nic, yes its ...|
|Hi Nic, you able ...|
|  Sure! See you then|
|Sorry running 10m...|
|No problem at all...|
|Just let them kno...|
|    We're on court 5|
|                 Ok.|
|       Just got here|
|Will get changed ...|
+--------------------+
only showing top 20 rows

+-------------------+
|          timestamp|
+-------------------+
|2023-09-01 00:00:00|
|2023-09-01 00:00:00|
|2023-09-01 00:00:00|
|2023-09-01 00:00:00|
|2023-09-01 00:00:00|
|2023-09-01 00:00:00|
|2023-09-01 00:00:00|
|2023-09-01 00:00:00|
|2023-09-01 00:00:00|
|2023-09-02 00:00:00|
|2023-09-03 00:00:00|
|2023-09-06 00:00:00|
|2023-09-06 00:00:00|
|2023-09-06 00:00:00|
|2023-09-06 00:00:00|
|2023-09-06 00:00:00|
|202

In [96]:
# Fit the dataframe and get predictions
#result = nlpPipeline.fit(text_df).transform(text_df)
# Display the extracted date information in a dataframe
#result.selectExpr("text","date.result as date", "multi_date.result as multi_date").show(truncate=False)
#result.selectExpr("text","timestamp").show(truncate=False)

# Add unique IDs to both DataFrames
text_df = text_df.withColumn("id", monotonically_increasing_id())
timestamp_df = timestamp_df.withColumn("id", monotonically_increasing_id())

# Join the two DataFrames using the common 'id' column
joined_df = text_df.join(timestamp_df, "id").drop("id")

# Define a UDF (User Defined Function) to extract anchor_day, anchor_month, and anchor_year
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

@udf(IntegerType())
def extract_day(timestamp):
    return timestamp.day

@udf(IntegerType())
def extract_month(timestamp):
    return timestamp.month

@udf(IntegerType())
def extract_year(timestamp):
    return timestamp.year

# Add columns for anchor_day, anchor_month, and anchor_year to the joined DataFrame
joined_df = joined_df.withColumn("anchor_day", extract_day(joined_df["timestamp"]))
joined_df = joined_df.withColumn("anchor_month", extract_month(joined_df["timestamp"]))
joined_df = joined_df.withColumn("anchor_year", extract_year(joined_df["timestamp"]))

joined_df.show()

+--------------------+-------------------+----------+------------+-----------+
|                text|          timestamp|anchor_day|anchor_month|anchor_year|
+--------------------+-------------------+----------+------------+-----------+
|Hi Nic, this is A...|2023-09-01 00:00:00|         1|           9|       2023|
|I just checked, i...|2023-09-01 00:00:00|         1|           9|       2023|
|Hi Aiyin, 6-7pm i...|2023-09-01 00:00:00|         1|           9|       2023|
|My time is usuall...|2023-09-01 00:00:00|         1|           9|       2023|
|Thats perfect! I'...|2023-09-01 00:00:00|         1|           9|       2023|
|Thanks. Just want...|2023-09-01 00:00:00|         1|           9|       2023|
|Probably have a l...|2023-09-01 00:00:00|         1|           9|       2023|
|Ahh okayy lets se...|2023-09-01 00:00:00|         1|           9|       2023|
|      Me too! Thanks|2023-09-01 00:00:00|         1|           9|       2023|
|Hi Aiyin, is this...|2023-09-02 00:00:00|         2

In [117]:
from pyspark.sql import Row
from pyspark.sql.functions import col
from pyspark.sql.functions import lit

# Set the anchor day, month and year for each row.
temp = joined_df.select("anchor_day", "anchor_month", "anchor_year")

#anchor_day_list = joined_df.select("anchor_day").rdd.flatMap(lambda x: x).collect()
#anchor_month_list = joined_df.select("anchor_month").rdd.flatMap(lambda x: x).collect()
#anchor_year_list = joined_df.select("anchor_year").rdd.flatMap(lambda x: x).collect()

result_schema = StructType([
    StructField('text', StringType(), True),
    # Add more fields as needed based on your NLP pipeline output
    # For example, if your NLP pipeline outputs a column named 'result', you can add it like this:
    StructField('document', ArrayType(
        StructType([
            StructField('annotatorType', StringType(), True),
            StructField('begin', IntegerType(), True),
            StructField('end', IntegerType(), True),
            StructField('result', StringType(), True),
            StructField('metadata', MapType(StringType(), StringType()), True),
            StructField('embeddings', ArrayType(FloatType()), True)
        ])
    ), True),    
    StructField('date', ArrayType(
        StructType([
            StructField('annotatorType', StringType(), True),
            StructField('begin', IntegerType(), True),
            StructField('end', IntegerType(), True),
            StructField('result', StringType(), True),
            StructField('metadata', MapType(StringType(), StringType()), True),
            StructField('embeddings', ArrayType(FloatType()), True)
        ])
    ), True),
    StructField('multi_date', ArrayType(
        StructType([
            StructField('annotatorType', StringType(), True),
            StructField('begin', IntegerType(), True),
            StructField('end', IntegerType(), True),
            StructField('result', StringType(), True),
            StructField('metadata', MapType(StringType(), StringType()), True),
            StructField('embeddings', ArrayType(FloatType()), True)
        ])
    ), True)
])


# Create an empty list to store the results
result_list = []
result_df = spark.createDataFrame([], schema = result_schema)
# Iterate through each row in the DataFrame
for row in joined_df.rdd.collect():
    # Extract text and date information from the row
    text = [row["text"]]
    anchor_day = row["anchor_day"]
    anchor_month = row["anchor_month"]
    anchor_year = row["anchor_year"]
        
    text_df = spark.createDataFrame(text, StringType()).toDF("text")
    print(anchor_day)
    result = nlpPipeline.fit(text_df).transform(text_df)
    result.show()
    
    # Remove the header row (if it's the first row)
    if result.count() == 0:
        result = result.withColumn("text", lit("text"))
    
    result_df = result_df.union(result)
    

#result_df.show() 

result_df.selectExpr("text","date.result as date", "multi_date.result as multi_date").show(truncate=False)


                                                                                

1
+--------------------+--------------------+----+----------+
|                text|            document|date|multi_date|
+--------------------+--------------------+----+----------+
|Hi Nic, this is A...|[{document, 0, 57...|  []|        []|
+--------------------+--------------------+----+----------+

1
+--------------------+--------------------+--------------------+----------+
|                text|            document|                date|multi_date|
+--------------------+--------------------+--------------------+----------+
|I just checked, i...|[{document, 0, 10...|[{date, 41, 44, 2...|        []|
+--------------------+--------------------+--------------------+----------+

1
+--------------------+--------------------+--------------------+----------+
|                text|            document|                date|multi_date|
+--------------------+--------------------+--------------------+----------+
|Hi Aiyin, 6-7pm i...|[{document, 0, 31...|[{date, 12, 14, 2...|        []|
+-------

23/09/27 15:54:27 WARN DAGScheduler: Broadcasting large task binary with size 1494.3 KiB
23/09/27 15:54:27 WARN DAGScheduler: Broadcasting large task binary with size 1494.3 KiB
23/09/27 15:54:28 WARN DAGScheduler: Broadcasting large task binary with size 1494.3 KiB
23/09/27 15:54:28 WARN DAGScheduler: Broadcasting large task binary with size 1494.3 KiB
23/09/27 15:54:31 WARN DAGScheduler: Broadcasting large task binary with size 1494.3 KiB

+-------------------------------------------------------------------------------------------------------+------------+----------+
|text                                                                                                   |date        |multi_date|
+-------------------------------------------------------------------------------------------------------+------------+----------+
|Hi Nic, this is Aiyin here.. We were in touch on Superprof                                             |[]          |[]        |
|I just checked, i didnt manage to book 9-10pm for Wednesday, only 6-7pm, would it be too early for you?|[2023/09/27]|[]        |
|Hi Aiyin, 6-7pm is not a problem                                                                       |[2023/09/27]|[]        |
|My time is usually very flexible as long I don‚Äôt have client meetings                                  |[]          |[]        |
|Thats perfect! I'll bring shuttles for the day, anything specific you'd like to work on

                                                                                

In [55]:
# Set the anchor day, month and year for each row.
temp = joined_df.select("anchor_day", "anchor_month", "anchor_year")

anchor_day_list = joined_df.select("anchor_day").rdd.flatMap(lambda x: x).collect()
anchor_month_list = joined_df.select("anchor_month").rdd.flatMap(lambda x: x).collect()
anchor_year_list = joined_df.select("anchor_year").rdd.flatMap(lambda x: x).collect()

# Create an empty list to store the results
result_list = []

# Iterate through each row in the DataFrame
for row in joined_df.rdd.collect():
    # Extract text and date information from the row
    text = str(row["text"])
    anchor_day = row["anchor_day"]
    anchor_month = row["anchor_month"]
    anchor_year = row["anchor_year"]
    
    # Create a new Row with the extracted values
    single_row = Row(text=text, anchor_day=anchor_day, anchor_month=anchor_month, anchor_year=anchor_year)
    
    # Apply the NLP pipeline to the single row and collect the result
    result = nlpPipeline.fit(spark.createDataFrame([single_row])).transform(spark.createDataFrame([single_row]))
    
    # Append the result as a Row to the result list
    result_list.append(Row(**result.first().asDict()))

result_list.show()

# loop through each row of the pyspark Dataframe and apply nlpPipeline to "text"
# Define a function to drop the 'document' entry from a Row
def apply_pipeline(row):
    # Convert the Row to a dictionary
    row_dict = row.asDict()
    
    # Check if the "document" entry exists in the dictionary
    for "text" in row_dict:
        # Remove the "document" entry from the dictionary
        del row_dict["document"]
    
    if "date" in row_dict:
        # Typecast "date" to DateType()
        print()
        row_dict["date"] = lit(row_dict["date"]).cast(DateType())
        
    #if "multi_date" in row_dict:
        # Typecast "multi_date" to DateType()
        #row_dict["multi_date"] = row_dict["multi_date"].cast(DateType())
    
    
    # Create a new Row from the modified dictionary
    return Row(**row_dict)

# Use list comprehension to drop the "document" entry from each Row in the list
#new_result_list = [apply_pipeline(row) for row in result_list]



#print(anchor_day[0])
#temp.show()
#print(anchor_day)

SyntaxError: cannot assign to literal (143592066.py, line 40)

In [118]:
# Set the anchor day, month and year for each row.
print(joined_df.anchor_day[0])

# Fit the dataframe and get predictions
result = nlpPipeline.fit(joined_df).transform(joined_df)
# Display the extracted date information in a dataframe
result.selectExpr("text","date.result as date", "multi_date.result as multi_date").show(30,truncate=False)

Column<'anchor_day[0]'>
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+--------------------+
|text                                                                                                                                                                                              |date        |multi_date          |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+--------------------+
|Hi Nic, this is Aiyin here.. We were in touch on Superprof                                                                                                                                        |[]          |[]                  |
|I just checked, i didnt manage to book 9-10pm for W