In [1]:
import nltk
import pandas as pd
import spacy
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
# Load spaCy English model
# RUN 'python -m spacy download en_core_web_sm' in the terminal first
nlp = spacy.load('en_core_web_sm')

# Download the NLTK resources needed later in the notebook:
#   - 'punkt' / 'punkt_tab': tokenizer data used by word_tokenize. Recent NLTK
#     releases look up 'punkt_tab' instead of 'punkt', so both are fetched to
#     keep the notebook working across NLTK versions.
#   - 'stopwords': the English stopword list used when filtering tokens.
# (nltk itself is imported in the import cell at the top of the notebook.)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/amenhasfaw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amenhasfaw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Location of the raw geotagged-tweets export.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the file before running the notebook elsewhere.
tweets_csv_path = '/Users/amenhasfaw/Downloads/geotagged_tweets_p_4'
df = pd.read_csv(tweets_csv_path)

In [None]:
# Names of the columns pandas stored with the generic `object` dtype
# (presumably the free-text fields of the export -- confirm against the data).
textual_columns = df.select_dtypes(include='object').columns

# Keep only those columns; every row of `df` is retained at this point.
textual_data = df.loc[:, textual_columns]


In [None]:
# Drop rows that exactly repeat an earlier row across all text columns.
# The first occurrence is kept and the surviving rows keep their index labels.
is_repeat = textual_data.duplicated(keep='first')
textual_data = textual_data[~is_repeat]

In [None]:
# Function to clean text
def clean_text(text):
    """Strip URLs and @-mentions from a piece of text.

    The text is split on whitespace; any token that starts with ``http`` or
    ``@`` is dropped and the remaining tokens are re-joined with single
    spaces (so runs of whitespace are collapsed as a side effect). No other
    characters are removed.

    Parameters
    ----------
    text : str or any
        The raw text. Non-string values (e.g. NaN/None for a missing cell)
        are returned unchanged instead of raising.

    Returns
    -------
    str or any
        The cleaned string, or the original value if it was not a string.
    """
    # Missing cells arrive as float NaN / None; leave them untouched.
    if not isinstance(text, str):
        return text

    unwanted_prefixes = ('http', '@')
    kept_words = [word for word in text.split() if not word.startswith(unwanted_prefixes)]
    return ' '.join(kept_words)

# Apply text cleaning to every cell of every text column.
# `textual_data` is a DataFrame, so `textual_data.apply(clean_text)` would pass
# clean_text an entire column (a Series) and fail on `.split()`. Map the
# function over the individual values instead; non-string cells (e.g. NaN) are
# passed through untouched.
textual_data = textual_data.apply(
    lambda column: column.map(
        lambda value: clean_text(value) if isinstance(value, str) else value
    )
)

In [None]:
# Function to flag English tweets
def filter_english(text):
    """Return True when ``text`` is detected as English, otherwise False.

    Parameters
    ----------
    text : str or any
        The text to classify. Non-string values (NaN/None) and blank strings
        are treated as "not English".

    Returns
    -------
    bool
        True if langdetect reports ``'en'``; False for any other language or
        when no language can be determined.
    """
    # Missing or empty cells carry no language signal at all.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Raised by langdetect when the text has no usable features
        # (e.g. only digits, punctuation or emoji). Catching just this
        # exception keeps kernel interrupts and genuine bugs visible.
        return False

# Filter non-English tweets
# langdetect is randomised by default; fixing the seed makes a re-run of the
# notebook keep the same rows.
DetectorFactory.seed = 0

# `textual_data` is a DataFrame, so a column-wise `.apply(filter_english)`
# yields one flag per COLUMN, which cannot be used to select rows. Build one
# flag per ROW instead.
# NOTE(review): the language is judged on all string cells of a row joined
# together; if a single column holds the tweet body, pass that column alone.
is_english = pd.Series(
    [
        filter_english(' '.join(cell for cell in row if isinstance(cell, str)))
        for row in textual_data.itertuples(index=False, name=None)
    ],
    index=textual_data.index,
    dtype=bool,
)
textual_data = textual_data[is_english]

In [None]:
# Function to tokenize text, remove stopwords, and stem words
_STOPWORD_SETS = {}


def _stopword_set(language='english'):
    """Return NLTK's stopword list for ``language`` as a cached frozenset."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def preprocess_text(text):
    """Lower-case, tokenize, drop English stopwords and Porter-stem ``text``.

    Parameters
    ----------
    text : str or any
        The text to normalise. Non-string values (e.g. NaN/None for a missing
        cell) are returned unchanged instead of raising.

    Returns
    -------
    str or any
        The stemmed, stopword-free tokens joined by single spaces, or the
        original value if it was not a string.
    """
    if not isinstance(text, str):
        return text

    # Fetch the stopword set once (cached across calls) rather than calling
    # stopwords.words('english') for every token, which rebuilds the list and
    # scans it linearly each time.
    stop_words = _stopword_set('english')
    stemmer = PorterStemmer()

    tokens = word_tokenize(text.lower())
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

# Apply preprocessing to every cell of every text column.
# `textual_data` is a DataFrame, so `textual_data.apply(preprocess_text)` would
# pass preprocess_text an entire column (a Series) and fail on `.lower()`. Map
# the function over the individual values instead; non-string cells (e.g. NaN)
# are passed through untouched.
textual_data = textual_data.apply(
    lambda column: column.map(
        lambda value: preprocess_text(value) if isinstance(value, str) else value
    )
)

In [None]:
# Persist the processed text columns as CSV in the current working directory.
# The DataFrame index is not written out.
cleaned_csv_path = 'cleaned_textual_data.csv'
textual_data.to_csv(cleaned_csv_path, index=False)