In [57]:
import csv

def extract_id_and_location(input_file, output_file):
    """Copy the ``id`` and ``location`` columns of one CSV file into another.

    The input is read as UTF-8 with its first row as the header; the output is
    written as UTF-8 with an ``id,location`` header followed by one row per
    input record.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to create or overwrite.

    Returns:
        None. A success message is printed on completion; any exception is
        caught and reported with ``print`` instead of being raised.
    """
    wanted = ['id', 'location']
    try:
        # The input is opened first, so a missing input never creates the output.
        with open(input_file, mode='r', encoding='utf-8') as src, \
                open(output_file, mode='w', newline='', encoding='utf-8') as dst:
            records = csv.DictReader(src)
            sink = csv.DictWriter(dst, fieldnames=wanted)
            sink.writeheader()
            for record in records:
                # Project each record down to the two columns of interest.
                sink.writerow({column: record[column] for column in wanted})

        print(f"Data has been successfully written to {output_file}")

    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
# NOTE(review): the output recorded under this cell reports "location.csv" rather
# than 'output.csv', so it was presumably produced with a different output path —
# re-run the cell to refresh it.
input_file = 'input.csv'  # Path to your input CSV file
output_file = 'output.csv'  # Path to your output CSV file
extract_id_and_location(input_file, output_file)


Data has been successfully written to location.csv


In [208]:
import pandas as pd

# Load the dataset into `data`, the DataFrame that the cleaning cells below read and modify.
data = pd.read_csv("mergedHashtag.csv")

In [209]:
# Show column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   target    7613 non-null   int64 
 4   text      7613 non-null   object
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [211]:
# Spot-check the text stored under index label 4 before preprocessing.
data['text'][4]

'just got sent this photo from ruby  as smoke from  pours into a school  wildfire'

In [160]:
import nltk

# Fetch the NLTK resources used by this notebook.
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
nltk.download('wordnet')  # WordNet data behind WordNetLemmatizer
nltk.download('words')  # word list used to build the English-vocabulary set
nltk.download('punkt_tab')  # tokenizer tables required by word_tokenize
nltk.download('averaged_perceptron_tagger_eng')  # tagger model for nltk.pos_tag

[nltk_data] Downloading package stopwords to /home/nt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /home/nt/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/nt/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/nt/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
%pip install -U nltk
%python -m nltk.downloader popular
%pip install -U pywsd

In [212]:
import re
import string
import json

from nltk.corpus import stopwords, words as nltk_words
from nltk.stem import WordNetLemmatizer

# Shared NLP resources, built once and reused by the helper functions in this notebook.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# NOTE(review): entries keep the corpus' original casing while is_english_word()
# lowercases its argument, so any capitalised entry can never match — confirm
# that this is intended.
english_words = set(nltk_words.words())

# Mapping of short form -> expansion consumed by expand_contractions().
# The encoding is passed explicitly so the load does not depend on the
# platform's default locale encoding.
with open('abbreviations.json', 'r', encoding='utf-8') as f:
    contractions = json.load(f)


def expand_contractions(text):
    """Replace each whole-word occurrence of a known short form with its expansion.

    Every key of the module-level ``contractions`` mapping is searched for in
    turn (case-sensitively, bounded by regex word boundaries) and replaced by
    the value it maps to. Substitutions are applied sequentially, in the
    mapping's iteration order, each one operating on the previous result.

    Args:
        text: String to rewrite.

    Returns:
        The string after all substitutions have been applied.
    """
    expanded = text
    for short_form in contractions:
        # re.escape keeps characters such as apostrophes or dots literal.
        whole_word = r'\b' + re.escape(short_form) + r'\b'
        expanded = re.sub(whole_word, contractions[short_form], expanded)
    return expanded


def split_hashtag(hashtag):
    """Turn a hashtag into space-separated words.

    The first character (expected to be ``#``) is dropped unconditionally, a
    space is inserted at every lowercase-to-uppercase boundary, and every
    character that is not an ASCII letter is replaced by a space.

    Args:
        hashtag: Hashtag string including its leading marker.

    Returns:
        The resulting string; it may contain leading, trailing or repeated spaces.
    """
    tag_body = hashtag[1:]
    # "wildFire" -> "wild Fire"
    camel_separated = re.sub(r'([a-z])([A-Z])', r'\1 \2', tag_body)
    # Digits, underscores and any other non-letters each become one space.
    return re.sub(r'[^a-zA-Z]', ' ', camel_separated)


def is_english_word(word):
    """Return True when the lowercased ``word`` is a member of ``english_words``.

    Args:
        word: Token to look up.

    Returns:
        bool: Result of the membership test against the module-level
        ``english_words`` set.
    """
    lowered = word.lower()
    return lowered in english_words

In [223]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn

def preprocess_text(text):
    """Normalise raw text into a space-separated string of English lemmas.

    Steps, in order: lowercase; expand short forms with
    ``expand_contractions``; remove e-mail addresses, @mentions and links;
    remove ASCII punctuation and non-ASCII characters; tokenise; then keep at
    most one lemma per token.

    A token is dropped when it is a single character or a stop word. Otherwise
    it is lemmatised as verb, adjective, adverb and noun (in that order) and
    the first lemma that is an English word and not a stop word is kept. If no
    lemma qualifies, the token itself is kept when it is an English word.

    Args:
        text: String to clean.

    Returns:
        str: The surviving lemmas joined by single spaces (possibly empty).
    """
    text = text.lower()
    text = expand_contractions(text)

    # Remove e-mail addresses. This has to run before the @mention and
    # punctuation steps: once '@' and '.' are gone the pattern cannot match.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)

    # Remove @mentions
    text = re.sub(r'@\w+', '', text)

    # Remove links
    text = re.sub(r'http[s]?://\S+', '', text)

    # Remove punctuation and special characters
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    filtered_tokens = []
    for word in word_tokenize(text):
        # Reject stop words on the surface form. Checking only the lemmas lets
        # a stop word through whenever one of its lemmas is an ordinary word
        # (e.g. the noun lemma of "does" is "doe").
        if len(word) == 1 or word in stop_words:
            continue

        for pos in (wn.VERB, wn.ADJ, wn.ADV, wn.NOUN):
            lemma = lemmatizer.lemmatize(word, pos=pos)
            if is_english_word(lemma) and lemma not in stop_words:
                filtered_tokens.append(lemma)
                break
        else:
            # No lemma qualified: fall back to the token itself.
            if is_english_word(word):
                filtered_tokens.append(word)

    return ' '.join(filtered_tokens)

# Chuyển từ số nhiều về số ít

In [224]:
# Smoke test of the lemmatisation step on a single inflected word.
print(preprocess_text('flooding'))

flood


In [225]:
import re

def preProcessing_location(text):
    """Reduce a location string to lowercase ASCII letters separated by single spaces.

    Args:
        text: Location string to clean.

    Returns:
        str: The lowercased text with every character other than ASCII letters
        and whitespace deleted, whitespace runs collapsed to one space, and
        leading/trailing whitespace removed.
    """
    # Characters are deleted, not replaced, so "york,ny" becomes "yorkny".
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_and_spaces).strip()


In [226]:
# Missing locations and keywords become empty strings so the string cleaners
# can be applied to every row.
data[['location', 'keyword']] = data[['location', 'keyword']].fillna('')

In [227]:
# Overwrite the text column with its cleaned form (element-wise).
data['text'] = data['text'].map(preprocess_text)

In [228]:
# Spot-check the same row after the text column has been cleaned.
data['text'][4]

'get send photo ruby smoke pour school wildfire'

In [229]:
# Overwrite the location column with its cleaned form (element-wise).
data['location'] = data['location'].map(preProcessing_location)

In [230]:
# Confirm the text column is unchanged by the location cleaning step.
data['text'][4]

'get send photo ruby smoke pour school wildfire'

In [231]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
data.to_csv('preprocessed_data.csv', index=False)

In [None]:
def proprocessing_location(text):
    """Lowercase ``text`` and strip e-mails, @mentions, links, hashtags,
    punctuation and non-ASCII characters.

    Args:
        text: String to clean.

    Returns:
        str: The cleaned string. Whitespace left behind by removed spans is
        kept as is (not collapsed or trimmed).
    """
    text = text.lower()

    # Remove e-mail addresses. Runs before the @mention step, which would
    # otherwise eat the "@domain" part and leave the local part behind.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)

    # Remove @mentions
    text = re.sub(r'@\w+', '', text)

    # Remove links
    text = re.sub(r'http[s]?://\S+', '', text)

    # Remove hashtags. Must run while '#' is still present, i.e. before the
    # punctuation step below.
    text = re.sub(r'\#\w+', '', text)

    # Remove punctuation and special characters
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    return text
   


In [23]:
import re
import string
import json

def preProcessing_text(text):
    """Lowercase ``text`` and delete every hashtag (``#`` followed by word characters).

    Args:
        text: String to clean.

    Returns:
        str: The lowercased text without hashtags; surrounding whitespace is
        left untouched.
    """
    return re.sub(r'\#\w+', '', text.lower())
   


In [31]:
# Smoke test: both hashtags should disappear while the spaces around them remain.
print(preProcessing_text("kjvkdfhgfkrefkeh #hfewhfekfk fwehfjewh #ewrwwe"))

kjvkdfhgfkrefkeh  fwehfjewh 


In [32]:
# Preview the text column before hashtag removal is applied in the next cell.
data['text']

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [33]:
# Lowercase the text column and strip its hashtags in place, then write the
# whole frame (without the row index) to disk.
data['text'] = data['text'].map(preProcessing_text)

output_file = 'removed_hashtag.csv'
data.to_csv(output_file, index=False)

print("Data has been cleaned and saved to", output_file)

Data has been cleaned and saved to removed_hashtag.csv


In [31]:
# Sanity check: is the verb lemma of 'survived' present in the English vocabulary set?
lemmatizer.lemmatize('survived', pos='v') in english_words

True

In [32]:
# Store the normalised text in its own column, leaving data['text'] untouched.
data['cleaned_text'] = data['text'].apply(preprocess_text)
# Locations go through the dedicated location cleaner. preprocess_text keeps
# only dictionary words that are not stop words, which would discard place
# names, and the location cleaner gives the same result if it has already been
# applied to this column.
data['location'] = data['location'].apply(preProcessing_location)

output_file = 'cleaned_data.csv'
data[['id', 'keyword', 'location', 'cleaned_text', 'target']].to_csv(output_file, index=False)

print("Data has been cleaned and saved to", output_file)

Data has been cleaned and saved to cleaned_data.csv


bỏ trùng