# Basic Text Processing
### In this tutorial we will look into basics of raw string processing
## Creating String in python

In [1]:
# Creating a String  
# with single Quotes 
my_string = 'Hello learners'
print(my_string)

Hello learners


In [2]:
# Creating a String 
# with double Quotes 
my_string = "Hello learners"
print(my_string)

Hello learners


In [3]:
# Creating a String 
# with triple Quotes 
my_string = '''Hello learners'''
print(my_string)

Hello learners


In [4]:
# Creating String with triple 
# Quotes allows multiple lines 
my_string = """Hello, welcome to
Supervised Learning"""
print(my_string)

Hello, welcome to
Supervised Learning


In [5]:
print('Hello\nWorld')

Hello
World


In [6]:
# Raw String to ignore escape sequence
# Adding r before a string makes it raw, so it is printing \n as it is and not rendering it to newline character.
print(r'Hello\nWorld')

Hello\nWorld


### Accessing characters in a string and slicing it
__Indexing in python starts with 0 and for backward indexing it starts with -1__

In [7]:
string = 'SupervisedLearning'
print('string = ', string)

string =  SupervisedLearning


In [8]:
#first character
print('string[0] = ', string[0])

string[0] =  S


In [9]:
#last character
print('string[-1] = ', string[-1])

string[-1] =  g


In [10]:
#Second last character
print('string[-2] = ', string[-2])

string[-2] =  n


In [11]:
#slicing 2nd to 5th character
print('string[1:5] = ', string[1:5])

string[1:5] =  uper


In [12]:
#slicing 6th to 2nd last character
print('string[5:-2] = ', string[5:-2])

string[5:-2] =  visedLearni


In [13]:
#getting all the string characters skipping the every 2nd 
print('string[5:-2] = ', string[0:len(string):2])

string[5:-2] =  SprieLann


## Deleting a String

In [14]:
del string

##  Concatenation of Two or More Strings

In [15]:
str1 = 'Hello'
str2 ='Learners!'

In [16]:
# using +
print('str1 + str2 = ', str1 +" "+ str2)

str1 + str2 =  Hello Learners!


In [17]:
# using *
print('str1 * 3 =', (str1 +" ")  * 3)

str1 * 3 = Hello Hello Hello 


In [18]:
# using .join
" ".join([str1, str2])

'Hello Learners!'

## String Membership Test

In [19]:
'N' in 'NLP'

True

In [20]:
'Y' in 'NLP'

False

In [21]:
'NL' in 'NLP'

True

## Formatting of Strings

In [22]:
# Default order 
String = "{} {} {}".format('Language', 'Natural', 'Processing') 
print("Print String in default order: ") 
print(String) 

Print String in default order: 
Language Natural Processing


In [23]:
# Positional Formatting 
String = "{1} {0} {2}".format('Language', 'Natural', 'Processing') 
print("Print String in Positional order: ") 
print(String) 

Print String in Positional order: 
Natural Language Processing


In [24]:
# Keyword Formatting 
String1 = "{n} {l} {p}".format(l= 'Language', n= 'Natural', p= 'Processing') 
print("Print String in order of Keywords: ") 
print(String1) 

Print String in order of Keywords: 
Natural Language Processing


### f-Strings: A New and Improved Way to Format Strings in Python

In [25]:
name = "Tushar"
age = 23
print(f"Hello, {name}. You are {age} years old.")

Hello, Tushar. You are 23 years old.


In [26]:
# It would also be valid to use a capital letter F
print(F"Hello, {name}. You are {age}.")

Hello, Tushar. You are 23.


In [27]:
print(f"{2 * 37}")

74


##### The f in f-strings may as well stand for "fast".

In [28]:
import timeit
string = """
name = 'Anjan'
age = 26
'{} is {}.format(name, age)'
"""
timeit.timeit(string, number = 10000)

0.0003961530000005098

In [29]:
import timeit
string = """
name = 'Anjan'
age = 26
f'{name} is {age}'
"""
timeit.timeit(string, number = 10000)

0.0025784850000007964

In [30]:
# Spacing and padding in f strings
library = [('Author', 'Topic', 'Pages'), ('Twain', 'Rafting', 601), ('Feynman', 'Physics', 95), ('Hamilton', 'Mythology', 144)]

for book in library:
    print(f'{book[0]:{10}} {book[1]:{8}} {book[2]:{7}}')

Author     Topic    Pages  
Twain      Rafting      601
Feynman    Physics       95
Hamilton   Mythology     144


## String Functions

In [31]:
import pandas as pd

In [33]:
string_functions = pd.read_html('https://www.programiz.com/python-programming/methods/string')

In [35]:
string_functions = string_functions[0]

In [36]:
pd.set_option('display.max_rows', 100)

In [37]:
string_functions

Unnamed: 0,Method,Description
0,Python String capitalize(),Converts first character to Capital Letter
1,Python String center(),Pads string with specified character
2,Python String casefold(),converts to casefolded strings
3,Python String count(),returns occurrences of substring in string
4,Python String endswith(),Checks if String Ends with the Specified Suffix
5,Python String expandtabs(),Replaces Tab character With Spaces
6,Python String encode(),returns encoded string of given string
7,Python String find(),Returns the index of first occurrence of subst...
8,Python String format(),formats string into nicer output
9,Python String index(),Returns Index of Substring


### Some most commonly used string functions

In [43]:
string= "supervisedlearning"
# Capitalizes the target strings first character.
print(string.capitalize())

Supervisedlearning


In [44]:
string= "SupervisedLearning"
# returns a copy of s with all alphabetic characters converted to lowercase
string.lower()

'supervisedlearning'

In [45]:
string= "natural language processing"
# returns a copy of s in which the first letter of each word is converted to uppercase and remaining letters are lowercase
string.title()

'Natural Language Processing'

In [46]:
string= "supervisedlearning"
string.upper()

'SUPERVISEDLEARNING'

In [47]:
string= "supervisedlearning"
# Determines whether the target string ends with a given substring.
string.endswith('g')

True

In [48]:
string= "supervisedlearning"
# Determines whether the target string starts with a given substring.
string.startswith('s')

True

In [53]:
# Determines whether the target string consists of alphabetic characters only.
'abc123'.isalpha()

False

In [55]:
#Determines whether the target string consists of digit characters only.
'123abc'.isdigit()

False

In [56]:
string= "Supervised Learning"
# Splits a string into a list of substrings.
string.split()

['Supervised', 'Learning']

In [57]:
string= "Supervised,Learning"
string.split(',')

['Supervised', 'Learning']

In [58]:
#with maxsplit=1 it splits for one 
'www.supervisedlearning.com'.split('.', maxsplit=1)

['www', 'supervisedlearning.com']

#### Explaining all those 67 string functions is out of the scope of this tutorial. So, just go through the above table once and you will be ready to play with strings.

## Working with Text Files

In [59]:
import os

In [60]:
# Get current working Directory
os.getcwd()

'E:\\Jupiter\\NLP by Prudhvi'

In [63]:
# Change current working directory
current_dir = os.getcwd()
os.chdir(path= current_dir)

#### Reading a text file

In [66]:
# Downloading using urlretrieve from http request
import urllib
urllib.request.urlretrieve("https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt", "shakespeare.txt")

('shakespeare.txt', <http.client.HTTPMessage at 0xa74ca90>)

In [67]:
# Open the text.txt file
my_file = open('shakespeare.txt')

In [68]:
# We can now read the file
contents= my_file.read()

In [69]:
contents[:500]

'This is the 100th Etext file presented by Project Gutenberg, and\nis presented in cooperation with World Library, Inc., from their\nLibrary of the Future and Shakespeare CDROMS.  Project Gutenberg\noften releases Etexts that are NOT placed in the Public Domain!!\n\nShakespeare\n\n*This Etext has certain copyright implications you should read!*\n\n<<THIS ELECTRONIC VERSION OF THE COMPLETE WORKS OF WILLIAM\nSHAKESPEARE IS COPYRIGHT 1990-1993 BY WORLD LIBRARY, INC., AND IS\nPROVIDED BY PROJECT GUTENBERG ETEXT'

In [70]:
# But what happens if we try to read it again?
my_file.read()

''

This happens because you can imagine the reading "cursor" is at the end of the file after having read it. So there is nothing left to read. We can reset the "cursor" like this:

In [71]:
# Seek to the start of file (index 0)
my_file.seek(0)

0

In [73]:
# Now read again
my_file.read()[:1001]

'This is the 100th Etext file presented by Project Gutenberg, and\nis presented in cooperation with World Library, Inc., from their\nLibrary of the Future and Shakespeare CDROMS.  Project Gutenberg\noften releases Etexts that are NOT placed in the Public Domain!!\n\nShakespeare\n\n*This Etext has certain copyright implications you should read!*\n\n<<THIS ELECTRONIC VERSION OF THE COMPLETE WORKS OF WILLIAM\nSHAKESPEARE IS COPYRIGHT 1990-1993 BY WORLD LIBRARY, INC., AND IS\nPROVIDED BY PROJECT GUTENBERG ETEXT OF ILLINOIS BENEDICTINE COLLEGE\nWITH PERMISSION.  ELECTRONIC AND MACHINE READABLE COPIES MAY BE\nDISTRIBUTED SO LONG AS SUCH COPIES (1) ARE FOR YOUR OR OTHERS\nPERSONAL USE ONLY, AND (2) ARE NOT DISTRIBUTED OR USED\nCOMMERCIALLY.  PROHIBITED COMMERCIAL DISTRIBUTION INCLUDES BY ANY\nSERVICE THAT CHARGES FOR DOWNLOAD TIME OR FOR MEMBERSHIP.>>\n\n*Project Gutenberg is proud to cooperate with The World Library*\nin the presentation of The Complete Works of William Shakespeare\nfor your

In [74]:
# Readlines returns a list of the lines in the file
my_file.seek(0)
my_file.readlines()[:5]

['This is the 100th Etext file presented by Project Gutenberg, and\n',
 'is presented in cooperation with World Library, Inc., from their\n',
 'Library of the Future and Shakespeare CDROMS.  Project Gutenberg\n',
 'often releases Etexts that are NOT placed in the Public Domain!!\n',
 '\n']

In [75]:
my_file.close()

### Writing to a file

In [76]:
# Add a second argument to the function, 'w' which stands for write.
# Passing 'w+' lets us read and write to the file

my_file = open('test.txt','w+')

In [77]:
# Write to the file
my_file.write('This is a new first line')

24

In [78]:
# Read the file
my_file.seek(0)
my_file.read()

'This is a new first line'

### Appending to a text

In [79]:
my_file = open('test.txt','a+')
my_file.write('\nThis line is being appended to test.txt')
my_file.write('\nAnd another line here.')

23

In [80]:
my_file.seek(0)
my_file.read()

'This is a new first line\nThis line is being appended to test.txt\nAnd another line here.'

### Appending using IPython %%writefile magic function

In [81]:
%%writefile -a test.txt

This is more text being appended to test.txt
And another line here.

Appending to test.txt


In [82]:
my_file.seek(0)
my_file.read()

'This is a new first line\nThis line is being appended to test.txt\nAnd another line here.\nThis is more text being appended to test.txt\nAnd another line here.\n'

In [83]:
my_file.close()

#### You can assign temporary variable names as aliases, and manage the opening and closing of files automatically using a context manager:

In [84]:
with open('test.txt','r') as txt:
    first_line = txt.readlines()[0]
    
print(first_line)

This is a new first line



## Working with PDFs

In [85]:
import urllib
urllib.request.urlretrieve("https://web.stanford.edu/~jurafsky/slp3/2.pdf", 'SLP.pdf')

('SLP.pdf', <http.client.HTTPMessage at 0xb056828>)

### Working with PyPDF2

In [86]:
!pip install PyPDF2



In [87]:
import PyPDF2

In [88]:
# Notice we read it as a binary with 'rb'
f = open('SLP.pdf','rb')

In [89]:
pdf_reader = PyPDF2.PdfFileReader(f)

In [90]:
#Number of pages
pdf_reader.numPages

30

In [91]:
page_one = pdf_reader.getPage(0)

In [92]:
# Extracting text from PyPDF2
page_one_text = page_one.extractText()

In [93]:
page_one_text

"SpeechandLanguageProcessing.DanielJurafsky&JamesH.Martin.Copyright\nc\n\n2019.All\nrightsreserved.DraftofOctober2,2019.\nCHAPTER\n2\nRegularExpressions,Text\nNormalization,EditDistance\nUser:Iamunhappy.\nELIZA:DOYOUTHINKCOMINGHEREWILLHELPYOUNOTTOBEUNHAPPY\nUser:Ineedsomehelp,thatmuchseemscertain.\nELIZA:WHATWOULDITMEANTOYOUIFYOUGOTSOMEHELP\nUser:PerhapsIcouldlearntogetalongwithmymother.\nELIZA:TELLMEMOREABOUTYOURFAMILY\nUser:Mymothertakescareofme.\nELIZA:WHOELSEINYOUFAMILYTAKESCAREOFYOU\nUser:Myfather.\nELIZA:YOURFATHER\nUser:Youarelikemyfatherinsomeways.\nWeizenbaum(1966)\nThedialogueaboveisfrom\nELIZA\n,anearlynaturallanguageprocessingsystem\nELIZA\nthatcouldcarryonalimitedconversationwithauserbyimitatingtheresponsesof\naRogerianpsychotherapist\n(Weizenbaum,1966)\n.ELIZAisasurprisinglysimple\nprogramthatusespatternmatchingtorecognizephraseslikeﬁIneedXﬂandtranslate\nthemintosuitableoutputslikeﬁWhatwoulditmeantoyouifyougotX?ﬂ.This\nsimpletechniquesucceedsinthisdomainbecauseELIZAdoesn'

In [94]:
f.close()