## **Read simple text file**

In [13]:
%%writefile myfile.txt

'''
Hello, this is a simple text file.
It contains a few lines of text.
'''

Overwriting myfile.txt


In [14]:
my_text = open("myfile.txt", "r", encoding="utf-8")

In [15]:
print(my_text)

<_io.TextIOWrapper name='myfile.txt' mode='r' encoding='utf-8'>


In [17]:
# Read the content of 'myfile.txt' into a string
with open('myfile.txt', 'r') as file:
    file_content = file.read()
    print(file_content)


'''
Hello, this is a simple text file.
It contains a few lines of text.
'''



In [18]:
import io

# Use io.StringIO to create a file-like object from the string content
string_io_obj = io.StringIO(file_content)

# Now string_io_obj can be used like a file object for reading
print(string_io_obj.read())  # This will print the content of 'myfile.txt'


'''
Hello, this is a simple text file.
It contains a few lines of text.
'''



## **Read long text file with chunk**

In [37]:
# request the raw text of The Great Gatsby
# you will need to leverage the requests package
import requests
r = requests.get(r'https://www.gutenberg.org/cache/epub/64317/pg64317.txt')
great_gatsby = r.text

In [38]:
great_gatsby



In [39]:
type(great_gatsby)

str

In [40]:
import io

# Simulate a large data stream
# Data(my_text) to be processed and written in chunks
data_stream = io.StringIO(great_gatsby)

In [41]:
# Open a file in write mode
with open('output.txt', 'w') as f:
    while True:
        # Read a chunk of data from the stream
        chunk = data_stream.read(10)  # reading 4 characters at a time

        if not chunk:
            break  # no more data in the stream

        # Process the chunk (for demonstration purposes, we'll just print it)
        print(chunk)

        # Write the processed chunk to the file (buffered by the OS or Python runtime)
        f.write(chunk)

# In the above example, the great_gatsby file will contain the data processed in chunks.

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m


“Carraw
ay.”

“W
ell, I’m a
ll right n
ow. Where 
have they 
got Jimmy?
”

I too
k him into
 the drawi
ng-room, w
here his s
on lay, an
d left him

there. S
ome little
 boys had 
come up on
 the steps
 and were 
looking in
to
the ha
ll; when I
 told them
 who had a
rrived, th
ey went re
luctantly

away.


After a li
ttle while
 Mr. Gatz 
opened the
 door and 
came out, 
his mouth

ajar, his
 face flus
hed slight
ly, his ey
es leaking
 isolated 
and
unpun
ctual tear
s. He had 
reached an
 age where
 death no 
longer has
 the
qual
ity of gha
stly surpr
ise, and w
hen he loo
ked around
 him now f
or the
fi
rst time a
nd saw the
 height an
d splendou
r of the h
all and th
e great
r
ooms openi
ng out fro
m it into 
other room
s, his gri
ef began t
o be
mixe
d with an 
awed pride
. I helped
 him to a 
bedroom up
stairs; wh
ile he
to
ok off his
 coat and 
vest I tol
d him that
 all arran
gements ha
d been
de
ferred unt
il he came
.

“I di
dn

In [42]:
great_gatsby



### **Digression**

In [45]:
# first, remove unwanted new line and tab characters from the text
for char in ["\n", "\d", "\t"]:
    great_gatsby = great_gatsby.replace(char, " ")

In [46]:
great_gatsby



In [47]:
# you can also subset for the book text
# (removing the project gutenburg introduction/footnotes)
great_gatsby = great_gatsby[1433:277912]
print(great_gatsby)



In [48]:
# print out some information about the text

# what's the data type of your text
print(f"the type of your data: {type(great_gatsby)}")

# how long is your text (in characters)?
print(f"length = {len(great_gatsby)} characters")

the type of your data: <class 'str'>
length = 276479 characters


In [49]:
# Which of your favorite characters is most mentioned?

# create an empty dict to keep track of mentions by character
reference_dict = {}
# create a list of characters
characters = ["daisy", "jay", "nick", "tom", "myrtle"]

# loop through each character to count their mentions
for character in characters:
    reference_dict[character] = great_gatsby.lower().count(character)

# turn your dictionary into a pandas dataframe and print it


import pandas as pd
df = pd.DataFrame(list(reference_dict.items()),
                 columns = ["character", "mentions"])
df = df.set_index("character")

df = df.sort_values(by = "mentions", ascending = False)
print("\n")
df





Unnamed: 0_level_0,mentions
character,Unnamed: 1_level_1
tom,219
daisy,186
nick,27
myrtle,23
jay,12
