-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataFormatter.py
136 lines (97 loc) · 4.11 KB
/
dataFormatter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import csv
import re
from os import mkdir
from os.path import exists
from typing import List
from tqdm import tqdm # Import the tqdm module
from app.utils.configurationReader import Configuration
from app.utils.files import filesInDirectory
MENTION_PATTERN = r'@([a-zA-Z0-9-_]{1,})'
HASHTAG_PATTERN = r'#([a-zA-Z0-9-_]{1,})'

# Compiled once at import time: formatLine runs for every row of every file.
_MENTION_RE = re.compile(MENTION_PATTERN)
_HASHTAG_RE = re.compile(HASHTAG_PATTERN)


def formatLine(line: List[str]) -> List[str]:
    '''
    Format one row of raw tweet data.

    The mentions and hashtags are extracted from the tweet text into their own
    comma separated columns (without the leading '@' / '#'), then the text is
    cleaned: mentions are removed, '#' characters are dropped, underscores
    become spaces, every ' : ' sequence is removed, surrounding whitespace is
    trimmed and the result is lower-cased.

    Parameters
    ----------
    line: List[str]
        A row holding at least five columns in this order:
        id, date, time, tweet, links. Any additional columns are ignored.

    Returns
    -------
    List[str]
        [id, date, time, cleaned tweet, links, mentions, hashtags]

    Raises
    ------
    ValueError
        If the row has fewer than five columns.

    Example
    -------
    >>> formatLine(['1', '2020-01-01', '12:00:00', 'This is a tweet from @user1 mentioning @user2 #hashtag1 #hashtag2', 'https://www.google.com'])
    ['1', '2020-01-01', '12:00:00', 'this is a tweet from  mentioning  hashtag1 hashtag2', 'https://www.google.com', 'user1,user2', 'hashtag1,hashtag2']
    '''
    # Only the first five columns are used; *_ swallows whatever follows
    tweetId, date, time, tweet, links, *_ = line

    # The capture group excludes the '@', so only the user names are kept
    mentions = ','.join(_MENTION_RE.findall(tweet))
    tweet = _MENTION_RE.sub('', tweet)

    # Hashtags are collected after the mentions were stripped and before the
    # '#' characters are dropped from the text
    hashtags = ','.join(_HASHTAG_RE.findall(tweet))

    # The hashtag words stay in the text; only the '#' goes away and
    # underscores are turned into spaces
    tweet = tweet.replace('#', '').replace('_', ' ')
    tweet = tweet.replace(' : ', '').strip()
    tweet = tweet.lower()

    return [tweetId, date, time, tweet, links, mentions, hashtags]
def formatFile(inputDirectory: str, outputDirectory: str, fileName: str) -> None:
    '''
    Format one csv file of raw tweet data and write the result as a new csv file.

    Reads '<inputDirectory>/<fileName>.csv', skips its first (header) line,
    formats every non empty row with formatLine and writes the rows, preceded
    by a fresh header, to '<outputDirectory>/<fileName>.csv'.

    Parameters
    ----------
    inputDirectory: str
        The directory of the input file
    outputDirectory: str
        The directory of the output file
    fileName: str
        The name of the file WITHOUT its extension ('.csv' is appended here)

    Returns
    -------
    None

    Example
    -------
    >>> formatFile('data/raw', 'data/formatted', 'tweets')
    # This will format data/raw/tweets.csv and save it as data/formatted/tweets.csv
    '''
    # newline='' is what the csv module expects for both reading and writing;
    # without it the written file gets '\r\r\n' line endings on Windows
    with open(f'{inputDirectory}/{fileName}.csv', 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header; the default keeps an empty file from raising StopIteration

        # All rows are collected before the output is opened, so the input
        # is fully read even when both directories are the same
        data = []
        for row in tqdm(reader, desc=f'Formatting {fileName}'):
            if not row:  # csv.reader yields [] for a blank line, which formatLine cannot unpack
                continue
            formatted = formatLine(row)
            if formatted:  # Drop None / empty results
                data.append(formatted)

    with open(f'{outputDirectory}/{fileName}.csv', 'w', encoding='utf-8', newline='') as csvFile:
        writer = csv.writer(csvFile)
        # Write the header
        writer.writerow(['id', 'date', 'time', 'tweet', 'links', 'mentions', 'hashtags'])
        # Write the data to the csv file (Note: the data is a list of lists of strings)
        writer.writerows(data)
def main():
    '''
    Format every csv file of the configured input directory.

    The directories are read from './config/formatting.json' (keys
    'inputDirectory' and 'outputDirectory'). If the input directory does not
    exist a message is printed and nothing else happens. The output directory
    is created when it is missing, then every file reported by
    filesInDirectory is passed to formatFile.

    NOTE(review): this assumes filesInDirectory returns bare file names such
    as 'tweets.csv' - confirm against app.utils.files.

    Parameters
    ----------
    None

    Returns
    -------
    None

    Example
    -------
    >>> main()
    # This will format the data
    '''
    # Imported here because the module only imports mkdir / exists at the top
    from os import makedirs
    from os.path import splitext

    config = Configuration.load_json("./config/formatting.json")  # Load the config file
    inputDirectory = config["inputDirectory"]
    outputDirectory = config["outputDirectory"]

    if not exists(inputDirectory):  # Nothing to format without an input directory
        print(f'The input directory {inputDirectory} does not exist\n')
        return

    if not exists(outputDirectory):  # If the output directory does not exist create it
        print(
            f'The out directory {outputDirectory} does not exist, the script will create it for you, please wait...\n')
        # makedirs also creates missing parent directories, which mkdir cannot do
        makedirs(outputDirectory, exist_ok=True)
        print('Finished creating the output directory\n')

    # formatFile appends '.csv' itself, so it needs the names without extension.
    # splitext only removes the last extension ('tweets.2020.csv' -> 'tweets.2020'),
    # whereas split('.')[0] would cut the name at its first dot.
    names = []
    for file in filesInDirectory(inputDirectory):
        name, extension = splitext(file)
        if extension and extension.lower() != '.csv':
            continue  # Not a csv file, formatFile would look for a file that does not exist
        names.append(name)

    print('Formatting Data...\n')
    for name in tqdm(names, desc="Formating Files"):  # tqdm is a progress bar
        formatFile(inputDirectory, outputDirectory, name)

    print('\nFormatting is Done Successfully!')  # Print a message to the console
# Run the formatting only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()