In [1]:
!pip install regex



In [2]:
import numpy as np
import pandas as pd
import regex as re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [3]:
data = pd.read_csv(r"/Users/ayeshaqureshi/Downloads/Data.csv")
data.shape

(7339, 1)

In [4]:
def date_no_alphabet(data):
    text =[' '.join(item) for item in data.values.tolist()]
    num_matches = 0
    dates = []
    for string in text:
        string = string.lower()
        match = re.search(r'\b((\d{1,2}[-/\\])?\d{1,2}[-/\\]\d{4}|\d{4}[-/\\]\d{1,2}([-/\\]\d{1,2})?)\b',string)
        if match:
            num_matches+=1
            dates.append(match.group())
    return num_matches,set(dates)

number_dates,match = date_no_alphabet(data)
print('Number of Records: ',number_dates,'\nValue Matched: ',match)

Number of Records:  16 
Value Matched:  {'2015-19'}


Considering dates of the form:

- dd/mm/yyyy
- dd/mm/yy
- mm/dd/yyyy
- mm/dd/yy
- mm/yy
- mm/yyyy
- yyyy/mm/dd
- yyyy/dd/mm
- yy/mm/dd
- yy/dd/mm
- yyyy/mm
- yy/mm

In [5]:
def w_word_count(data):
    text =[' '.join(item) for item in data.values.tolist()]
    count=0
    for string in text:
        string = string.lower()
        match = re.findall(r'w[a-z0-9]+',string)
        if match:
            count+=1
    return count

w_words = w_word_count(data)
print('Number of words starting with w: ',w_words)

Number of words starting with w:  4958


In [6]:
data_list = data['Data'].astype(str).tolist()

In [7]:
def s_with_alphabet(data_list):
    count = 0
    url_pattern = r'(https?:\/\/|www\.)?[a-zA-Z0-9]+\.[^\s]{2,}'
    word_pattern = r'[a-zA-Z][a-zA-Z0-9]*'

    for i in data_list:
        excl_url = re.sub(url_pattern,'', i)

        match =  re.findall(word_pattern,excl_url)
        # print(match)
        count+=len(match)
    return count

print('Number of records with words start with an alphabet and are not URLs:', s_with_alphabet(data_list))

Number of records with words start with an alphabet and are not URLs: 115835


In [8]:
def emoji(data_list):
    reg = r'(:\)|:D|;\)|:P)'
    count=0

    for i in data_list:
        if re.findall(reg, i):
            count+=1
    return count

print('Number of tweets contain these emojis :), :D, ;), :P:', emoji(data_list))

Number of tweets contain these emojis :), :D, ;), :P: 18


In [9]:
def decimal_number_count(data):
    count = 0

    # Iterate through each string in the list
    for string in data_list:
        # searches for record starting with decimal number
        if re.search(r'\b\d+\.\d+\b', string):
            count += 1  # Increment count by 1 if there is a match

    return count

decimal_numbers = decimal_number_count(data)
print('Number of records containing a decimal number: ', decimal_numbers)

Number of records containing a decimal number:  25


In [10]:
def IP_address_count(data):
    count = 0

    # Iterate through each string in the list
    for string in data_list:
        # searches for record with IP address
        if re.search(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', string):
            count += 1  # Increment count by 1 if there is a match

    return count

IP_address = IP_address_count(data)
print('Number of records containingIP address: ', IP_address)

Number of records containingIP address:  0


In [11]:
def newline_records(data):
    count = 0
    for a in data_list:
        if '\n' in a:
            count += 1
    return count
print('Number of newline records',newline_records(data))

Number of newline records 1211


In [12]:
def number_of_hashtags(data):
    count = 0
    hashtag_type = r'#\w+'
    for a in data_list:
        count += len(re.findall(hashtag_type, a)) #re.finall() finds all matches in each tweet
    return count
print('Number of hashtags record',number_of_hashtags(data))

Number of hashtags record 2924


What is the code to substitute all non-alphanumeric characters with a new line



In [14]:
def sub_data():

    with open('/content/NLP Ungraded.csv', 'r', encoding='utf-8') as content:
        data = content.read()

    # To substitute all the all non - numeric characters using (a-zA-Z0-9) with new line
    data1 = re.sub(r'[^a-zA-Z0-9]+', '\n', data)

    data1 = '\n'.join(line.strip() for line in data1.split('\n') if line.strip())

    # First 100 characters
    print(data1[:100])

sub_data()

Data
Watch
or
listen
live
weekdays
at
8
30am
MT
at
ryanjespersen
com
Subscribe
via
YouTube
or
your
f


What is the total number of URLs across all tweets?

In [15]:
def count_urls(data):
    url_exp = r'https?://[^\s]+' #expression to match the URLs
    count = 0
    for tweets in data: #Iterate over each row of the data
      count += len(re.findall(url_exp, tweets)) #Find the respective URLs in tweets
    return count
print('Number of URLs found:', count_urls(data_list))

Number of URLs found: 4


Perform stemming and lemmatization

In [17]:
def clean_text(token):
    cleaned_tokens = [word for word in token if word.isalpha()]
    return cleaned_tokens

In [21]:
df = pd.read_csv('/content/NLP Ungraded.csv')
df.shape

(7339, 1)

In [18]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [22]:
def stemming(text):
    ps = PorterStemmer()
    initial_tokens = [] #to store all the tokens
    global stemmed_tokens
    stemmed_tokens = [] #to store stemmed tokens

    for t in text:
        tokens = nltk.tokenize.word_tokenize(t.lower()) #converting to lower case and tokenizing
        tokens = clean_text(tokens)
        initial_tokens.extend(tokens) #storing all the tokens
        stemmed_tokens.extend(ps.stem(token) for token in tokens) #performing stemming and storing stemmed tokens

    return len(set(initial_tokens)), len(set(stemmed_tokens)) #returning lengths of tokens

unique_before_stemming, unique_after_stemming = stemming(df['Data']) #function call
print(f"Unique tokens before stemming: {unique_before_stemming}")
print(f"Unique tokens after stemming: {unique_after_stemming}")

Unique tokens before stemming: 7119
Unique tokens after stemming: 5820


Perform lemmatization using NLTK lemmatizer. Count the number of unique words/tokens before and after lemmatization

In [24]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [25]:
def lemmatizing(text):
    lm = WordNetLemmatizer()
    initial_tokens = [] #to store all the tokens
    global lemmatized_tokens
    lemmatized_tokens = [] #to store lemmatized tokens

    for t in text:
        tokens = nltk.tokenize.word_tokenize(t.lower()) #converting to lower case and tokenizing
        tokens = clean_text(tokens)

        initial_tokens.extend(tokens) #storing all the tokens
        lemmatized_tokens.extend(lm.lemmatize(token) for token in tokens) #performing lemmatization and storing lemmatized tokens
    return len(set(initial_tokens)), len(set(lemmatized_tokens)) #returning lengths of tokens

unique_before_lemmatizing, unique_after_lemmatizing = lemmatizing(df['Data']) #function call
print(f"Unique tokens before lemmatizing: {unique_before_lemmatizing}") #printing results
print(f"Unique tokens after lemmatizing: {unique_after_lemmatizing}")

Unique tokens before lemmatizing: 7119
Unique tokens after lemmatizing: 6587


In [26]:
from collections import Counter

# Top 10 words after lemmatization
wordcount = dict(Counter(lemmatized_tokens))
sorted_values1 = list(sorted(wordcount.items(),key = lambda item: item[1],reverse=True))
sorted_values1[:10]

[('and', 3514),
 ('the', 2859),
 ('of', 2223),
 ('to', 2179),
 ('a', 1768),
 ('in', 1541),
 ('for', 1507),
 ('i', 1189),
 ('alberta', 1061),
 ('is', 923)]

In [27]:
# TOp 10 words after Stemming
wordcount = dict(Counter(stemmed_tokens))
sorted_values2 = list(sorted(wordcount.items(),key = lambda item: item[1],reverse=True))
sorted_values2[:10]

[('and', 3514),
 ('the', 2859),
 ('of', 2223),
 ('to', 2179),
 ('a', 1551),
 ('in', 1541),
 ('for', 1507),
 ('i', 1189),
 ('alberta', 1061),
 ('is', 923)]

In [28]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [29]:
def clean_text(token):
    cleaned_tokens = [word for word in token if word.isalpha()]          # keeping only aphabets
    cleaned_tokens = [word for word in cleaned_tokens if word not in stop_words]        # removing stopwords

    return cleaned_tokens

In [30]:
def stemming(text):
    ps = PorterStemmer()
    initial_tokens = [] #to store all the tokens
    global stemmed_tokens
    stemmed_tokens = [] #to store stemmed tokens

    for t in text:
        tokens = nltk.tokenize.word_tokenize(t.lower()) #converting to lower case and tokenizing
        tokens = clean_text(tokens)
        initial_tokens.extend(tokens) #storing all the tokens
        stemmed_tokens.extend(ps.stem(token) for token in tokens) #performing stemming and storing stemmed tokens

    return len(set(initial_tokens)), len(set(stemmed_tokens)) #returning lengths of tokens

unique_before_stemming, unique_after_stemming = stemming(df['Data']) #function call
print(f"Unique tokens before stemming: {unique_before_stemming}")
print(f"Unique tokens after stemming: {unique_after_stemming}")

Unique tokens before stemming: 6992
Unique tokens after stemming: 5707


In [31]:
def lemmatizing(text):
    lm = WordNetLemmatizer()
    initial_tokens = [] #to store all the tokens
    global lemmatized_tokens
    lemmatized_tokens = [] #to store lemmatized tokens

    for t in text:
        tokens = nltk.tokenize.word_tokenize(t.lower()) #converting to lower case and tokenizing
        tokens = clean_text(tokens)

        initial_tokens.extend(tokens) #storing all the tokens
        lemmatized_tokens.extend(lm.lemmatize(token) for token in tokens) #performing lemmatization and storing lemmatized tokens
    return len(set(initial_tokens)), len(set(lemmatized_tokens)) #returning lengths of tokens

unique_before_lemmatizing, unique_after_lemmatizing = lemmatizing(df['Data']) #function call
print(f"Unique tokens before lemmatizing: {unique_before_lemmatizing}") #printing results
print(f"Unique tokens after lemmatizing: {unique_after_lemmatizing}")

Unique tokens before lemmatizing: 6992
Unique tokens after lemmatizing: 6465


In [32]:
# Top 10 words after lemmatization
wordcount = dict(Counter(lemmatized_tokens))
sorted_values1 = list(sorted(wordcount.items(),key = lambda item: item[1],reverse=True))
sorted_values1[:10]

[('alberta', 1061),
 ('news', 604),
 ('canada', 559),
 ('outdoor', 549),
 ('community', 542),
 ('adventure', 516),
 ('calgary', 390),
 ('writer', 385),
 ('park', 380),
 ('love', 361)]

In [33]:
# TOp 10 words after Stemming
wordcount = dict(Counter(stemmed_tokens))
sorted_values2 = list(sorted(wordcount.items(),key = lambda item: item[1],reverse=True))
sorted_values2[:10]

[('alberta', 1061),
 ('outdoor', 682),
 ('news', 604),
 ('commun', 595),
 ('canada', 559),
 ('adventur', 530),
 ('travel', 459),
 ('follow', 446),
 ('love', 410),
 ('calgari', 390)]