Adapted from Juhee's code

In [1]:
# import libraries

import pandas as pd
import numpy as np
import re

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer 

In [2]:
# read in the clean dataset

df = pd.read_csv('../data/cleaned.csv')

In [3]:
print(df.shape)

df.head(3)

(19768, 6)


Unnamed: 0,id,text,user_name,date,lat,long
0,1303845642016497664,Oregon governor says towns have been ‘substant...,maxwell18191708,2020-09-09 23:59:59+00:00,35.369139,-121.033126
1,1304359517552095235,96% Overwhelmingly Positive Reviews! Grab a fr...,JoinDeepRock,2020-09-11 10:01:57+00:00,35.507931,-121.352019
2,1303845637851471872,‘Catastrophic’ wildfires may be deadliest in O...,SayWHARadio,2020-09-09 23:59:59+00:00,36.39038,-118.941569


In [4]:
## count sentences in each tweet (row)

# set up an emty list for sentence counts

n_sentence = []

# iterate through the text column

for i in range(len(df['text'])):
    
    n_sentence.append(len(sent_tokenize(df['text'][i].lower())))
    
# store it in the dataframe

df['n_sentence'] = n_sentence

In [5]:
# instantiate lemmatizer and tokenizer

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

In [6]:
## tokenize and lemmatize

# create a column 'text_clean'

df['text_clean'] = ''

# iterate through each row in the column text_all
    
for i in range(len(df['text'])):
        
    # tokenize each word in text into its own string
    text_token = []
    text_token.extend(tokenizer.tokenize(df['text'][i].lower()))
    text_tokens = []
    [text_tokens.append(word) for word in text_token if word not in text_tokens]
        
    # lemmatize the words
    text_lemmatize = []
    for j in range(len(text_tokens)):
        text_lemmatize.append(lemmatizer.lemmatize(text_tokens[j]))
        
    # remove characters and numbers
    clean_text = []
    for k in range(len(text_lemmatize)):
        clean_text.append(re.sub('[^a-zA-Z]', '', text_lemmatize[k]))    
        
    # group them together
    texts_collection = [text for text in clean_text]
    
    # put the words back to one long string for vectorization
    texts_collection = ' '.join(texts_collection)

    # fill new column
    df['text_clean'][i] = texts_collection

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
## count words in each tweet (row)

# set up an empty list for word counts

n_words = []

# iterate through the text column

for i in range(len(df['text_clean'])):
    
    word_tokens = tokenizer.tokenize(df['text_clean'][i])
    cnt = len(word_tokens)
    n_words.append(cnt)
    
# store it in the dataframe

df['n_words'] = n_words

In [8]:
df.head()

Unnamed: 0,id,text,user_name,date,lat,long,n_sentence,text_clean,n_words
0,1303845642016497664,Oregon governor says towns have been ‘substant...,maxwell18191708,2020-09-09 23:59:59+00:00,35.369139,-121.033126,1,oregon governor say town have been substantial...,19
1,1304359517552095235,96% Overwhelmingly Positive Reviews! Grab a fr...,JoinDeepRock,2020-09-11 10:01:57+00:00,35.507931,-121.352019,2,overwhelmingly positive review grab a frien...,15
2,1303845637851471872,‘Catastrophic’ wildfires may be deadliest in O...,SayWHARadio,2020-09-09 23:59:59+00:00,36.39038,-118.941569,1,catastrophic wildfire may be deadliest in oreg...,11
3,1303845628544520193,"SFGate: Thick wildfire smoke blocks sun, turns...",rankstr,2020-09-09 23:59:56+00:00,34.979422,-119.251991,1,sfgate thick wildfire smoke block sun turn b...,13
4,1303845625713188865,Still having a hard time processing that I had...,zach_wilk,2020-09-09 23:59:56+00:00,37.455476,-121.470958,3,still having a hard time processing that i had...,37


In [9]:
# reorder the order of the columns

df = df[['id', 'user_name','lat', 'long', 'date', 'text', 'text_clean', 'n_sentence', 'n_words']]

In [10]:
df.head()

Unnamed: 0,id,user_name,lat,long,date,text,text_clean,n_sentence,n_words
0,1303845642016497664,maxwell18191708,35.369139,-121.033126,2020-09-09 23:59:59+00:00,Oregon governor says towns have been ‘substant...,oregon governor say town have been substantial...,1,19
1,1304359517552095235,JoinDeepRock,35.507931,-121.352019,2020-09-11 10:01:57+00:00,96% Overwhelmingly Positive Reviews! Grab a fr...,overwhelmingly positive review grab a frien...,2,15
2,1303845637851471872,SayWHARadio,36.39038,-118.941569,2020-09-09 23:59:59+00:00,‘Catastrophic’ wildfires may be deadliest in O...,catastrophic wildfire may be deadliest in oreg...,1,11
3,1303845628544520193,rankstr,34.979422,-119.251991,2020-09-09 23:59:56+00:00,"SFGate: Thick wildfire smoke blocks sun, turns...",sfgate thick wildfire smoke block sun turn b...,1,13
4,1303845625713188865,zach_wilk,37.455476,-121.470958,2020-09-09 23:59:56+00:00,Still having a hard time processing that I had...,still having a hard time processing that i had...,3,37


In [11]:
# check for null values

df.isnull().sum()

id            0
user_name     0
lat           0
long          0
date          0
text          0
text_clean    0
n_sentence    0
n_words       0
dtype: int64

In [12]:
# save this dataframe to a csv file

df.to_csv('../data/final.csv', index = False)